diff --git a/.pre-commit-hooks/copyright-check.hook b/.pre-commit-hooks/copyright-check.hook
index 26044c29e4fdc827abb4ba2d415db66c780fd366..761edbc018bf840626367d4bed5b0083e0021275 100644
--- a/.pre-commit-hooks/copyright-check.hook
+++ b/.pre-commit-hooks/copyright-check.hook
@@ -19,7 +19,7 @@ import subprocess
import platform
COPYRIGHT = '''
-Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
diff --git a/README.md b/README.md
index a43e21bd2e4970b4da004e98a3c2e2c076e275d1..c9d4796c8c7c77ff1e0526c3f46a39a24d8794a3 100644
--- a/README.md
+++ b/README.md
@@ -24,6 +24,8 @@
| Documents
| Models List
| AIStudio Courses
+ | Paper
+ | Gitee
diff --git a/README_cn.md b/README_cn.md
index ed5c6a90dcf2d773002691ed4186282ae67386e8..c751b061dcbbfd4ee8c66bf494e006028a0a9ae1 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -25,6 +25,8 @@
| 教程文档
| 模型列表
| AIStudio 课程
+ | 论文
+ | Gitee
diff --git a/audio/paddleaudio/metric/__init__.py b/audio/paddleaudio/metric/__init__.py
index d2b3a1360f5603d3c5eadf273ef9a98a61e5a485..7ce6f5cfffda1f475c2cc6b2734c98027957d123 100644
--- a/audio/paddleaudio/metric/__init__.py
+++ b/audio/paddleaudio/metric/__init__.py
@@ -11,6 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from .dtw import dtw_distance
from .eer import compute_eer
from .eer import compute_minDCF
diff --git a/audio/paddleaudio/metric/dtw.py b/audio/paddleaudio/metric/dtw.py
deleted file mode 100644
index 662e4506d03fcbdd229b547a2a4d12c09667bb5f..0000000000000000000000000000000000000000
--- a/audio/paddleaudio/metric/dtw.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import numpy as np
-from dtaidistance import dtw_ndim
-
-__all__ = [
- 'dtw_distance',
-]
-
-
-def dtw_distance(xs: np.ndarray, ys: np.ndarray) -> float:
- """Dynamic Time Warping.
- This function keeps a compact matrix, not the full warping paths matrix.
- Uses dynamic programming to compute:
-
- Examples:
- .. code-block:: python
-
- wps[i, j] = (s1[i]-s2[j])**2 + min(
- wps[i-1, j ] + penalty, // vertical / insertion / expansion
- wps[i , j-1] + penalty, // horizontal / deletion / compression
- wps[i-1, j-1]) // diagonal / match
-
- dtw = sqrt(wps[-1, -1])
-
- Args:
- xs (np.ndarray): ref sequence, [T,D]
- ys (np.ndarray): hyp sequence, [T,D]
-
- Returns:
- float: dtw distance
- """
- return dtw_ndim.distance(xs, ys)
diff --git a/audio/setup.py b/audio/setup.py
index ec67c81def776d25e86800ef3606093e91e4c2ef..80fe07b7a91791aafd5f671f9456899178d44531 100644
--- a/audio/setup.py
+++ b/audio/setup.py
@@ -83,7 +83,7 @@ setuptools.setup(
python_requires='>=3.6',
install_requires=[
'numpy >= 1.15.0', 'scipy >= 1.0.0', 'resampy >= 0.2.2',
- 'soundfile >= 0.9.0', 'colorlog', 'dtaidistance == 2.3.1', 'pathos'
+ 'soundfile >= 0.9.0', 'colorlog', 'pathos == 0.2.8'
],
extras_require={
'test': [
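With `dtw.py` deleted and `dtaidistance` dropped from `install_requires`, downstream code that still needs multidimensional DTW can call `dtaidistance` directly. A minimal sketch, assuming `dtaidistance` is installed separately and the inputs are `[T, D]` arrays as in the removed wrapper:

```python
# Minimal sketch: multidimensional DTW without the removed paddleaudio wrapper.
# Assumes `pip install dtaidistance`, since it is no longer a paddleaudio dependency.
import numpy as np
from dtaidistance import dtw_ndim

ref = np.random.randn(100, 80)   # reference sequence, shape [T, D]
hyp = np.random.randn(120, 80)   # hypothesis sequence, shape [T, D]

distance = dtw_ndim.distance(ref, hyp)  # the same call the removed dtw_distance() wrapped
print(f"DTW distance: {distance:.4f}")
```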
diff --git a/demos/README.md b/demos/README.md
index 8abd67249d7ad939db6d79d7b8160b8efa7cb8ba..2a306df6b1e1b3648b7306506adc01d5e0ffcdf2 100644
--- a/demos/README.md
+++ b/demos/README.md
@@ -2,14 +2,14 @@
([简体中文](./README_cn.md)|English)
-The directory containes many speech applications in multi scenarios.
+This directory contains many speech applications in multiple scenarios.
* audio searching - mass audio similarity retrieval
* audio tagging - multi-label tagging of an audio file
-* automatic_video_subtitiles - generate subtitles from a video
+* automatic_video_subtitles - generate subtitles from a video
* metaverse - 2D AR with TTS
* punctuation_restoration - restore punctuation from raw text
-* speech recogintion - recognize text of an audio file
+* speech recognition - recognize text of an audio file
* speech server - Server for Speech Task, e.g. ASR,TTS,CLS
* streaming asr server - receive audio stream from websocket, and recognize to transcript.
* speech translation - end to end speech translation
diff --git a/demos/audio_searching/src/encode.py b/demos/audio_searching/src/encode.py
index c89a11c1f86b86a32dd70477a27e3842dffccfe9..f6bcb00adda6fb7b6d07327a860b7a6e877d352e 100644
--- a/demos/audio_searching/src/encode.py
+++ b/demos/audio_searching/src/encode.py
@@ -14,7 +14,7 @@
import numpy as np
from logs import LOGGER
-from paddlespeech.cli import VectorExecutor
+from paddlespeech.cli.vector import VectorExecutor
vector_executor = VectorExecutor()
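For reference, a usage sketch with the relocated import; the model name and audio path are placeholders taken from the speaker_verification demo below, not part of this change:

```python
# Sketch only: speaker embedding extraction via the per-task CLI import.
from paddlespeech.cli.vector import VectorExecutor

vector_executor = VectorExecutor()
audio_emb = vector_executor(
    model='ecapatdnn_voxceleb12',        # assumed default model, as used in the demos
    audio_file='./85236145389.wav')      # sample wav referenced elsewhere in this diff
print(audio_emb.shape)                   # a 1-D numpy embedding vector
```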
diff --git a/demos/audio_tagging/README.md b/demos/audio_tagging/README.md
index 9d4af0be6c286ecb5ea09baa9178217d67c3f20c..fc4a334ea053d30402cf112848ae1cd81f9f1f57 100644
--- a/demos/audio_tagging/README.md
+++ b/demos/audio_tagging/README.md
@@ -57,7 +57,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespe
- Python API
```python
import paddle
- from paddlespeech.cli import CLSExecutor
+ from paddlespeech.cli.cls import CLSExecutor
cls_executor = CLSExecutor()
result = cls_executor(
diff --git a/demos/audio_tagging/README_cn.md b/demos/audio_tagging/README_cn.md
index 79f87bf8c224e4d0c57368130d04fbc7b32d811e..36b5d8aaf231342f5addd4ba31129881ea1f19f5 100644
--- a/demos/audio_tagging/README_cn.md
+++ b/demos/audio_tagging/README_cn.md
@@ -57,7 +57,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespe
- Python API
```python
import paddle
- from paddlespeech.cli import CLSExecutor
+ from paddlespeech.cli.cls import CLSExecutor
cls_executor = CLSExecutor()
result = cls_executor(
diff --git a/demos/automatic_video_subtitiles/README.md b/demos/automatic_video_subtitiles/README.md
index db6da40db0add1b61d95aecc389d645f001b735f..b815425ec426b1b6b1bc66e1235b52afb7dffa0e 100644
--- a/demos/automatic_video_subtitiles/README.md
+++ b/demos/automatic_video_subtitiles/README.md
@@ -28,7 +28,8 @@ ffmpeg -i subtitle_demo1.mp4 -ac 1 -ar 16000 -vn input.wav
- Python API
```python
import paddle
- from paddlespeech.cli import ASRExecutor, TextExecutor
+ from paddlespeech.cli.asr import ASRExecutor
+ from paddlespeech.cli.text import TextExecutor
asr_executor = ASRExecutor()
text_executor = TextExecutor()
diff --git a/demos/automatic_video_subtitiles/README_cn.md b/demos/automatic_video_subtitiles/README_cn.md
index fc7b2cf6a04203dee389f4e2d640d3a3de57cc3a..990ff6dbdf0cb2e0c31ed296b44783a4b6254711 100644
--- a/demos/automatic_video_subtitiles/README_cn.md
+++ b/demos/automatic_video_subtitiles/README_cn.md
@@ -23,7 +23,8 @@ ffmpeg -i subtitle_demo1.mp4 -ac 1 -ar 16000 -vn input.wav
- Python API
```python
import paddle
- from paddlespeech.cli import ASRExecutor, TextExecutor
+ from paddlespeech.cli.asr import ASRExecutor
+ from paddlespeech.cli.text import TextExecutor
asr_executor = ASRExecutor()
text_executor = TextExecutor()
diff --git a/demos/automatic_video_subtitiles/recognize.py b/demos/automatic_video_subtitiles/recognize.py
index 72e3c3a8597283916762d92dcc9f5e95c6261d2f..304599d1990c4da5e37df803afb8f65aef033c30 100644
--- a/demos/automatic_video_subtitiles/recognize.py
+++ b/demos/automatic_video_subtitiles/recognize.py
@@ -16,8 +16,8 @@ import os
import paddle
-from paddlespeech.cli import ASRExecutor
-from paddlespeech.cli import TextExecutor
+from paddlespeech.cli.asr import ASRExecutor
+from paddlespeech.cli.text import TextExecutor
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
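The same import split applies to this recognize.py script; a hedged sketch of the two-stage pipeline (recognition, then punctuation restoration), where the wav path is a placeholder:

```python
# Sketch: speech recognition followed by punctuation restoration, using the new import paths.
import paddle

from paddlespeech.cli.asr import ASRExecutor
from paddlespeech.cli.text import TextExecutor

asr_executor = ASRExecutor()
text_executor = TextExecutor()

# input.wav is assumed to be the 16 kHz mono file produced by the ffmpeg step in this demo.
text = asr_executor(audio_file='input.wav', device=paddle.get_device())
result = text_executor(text=text, task='punc', device=paddle.get_device())
print(result)
```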
diff --git a/demos/punctuation_restoration/README.md b/demos/punctuation_restoration/README.md
index 518d437dc20a34251e24f58298b0f49dcd57e967..458ab92f9b230069f11464afe1ff2b5331a08c73 100644
--- a/demos/punctuation_restoration/README.md
+++ b/demos/punctuation_restoration/README.md
@@ -42,7 +42,7 @@ The input of this demo should be a text of the specific language that can be pas
- Python API
```python
import paddle
- from paddlespeech.cli import TextExecutor
+ from paddlespeech.cli.text import TextExecutor
text_executor = TextExecutor()
result = text_executor(
diff --git a/demos/punctuation_restoration/README_cn.md b/demos/punctuation_restoration/README_cn.md
index 9d4be8bf08a8d512cd4189367cca9cd8590a60f4..f25acdadbd31385e13d64677ea7a39f3ce5f22c9 100644
--- a/demos/punctuation_restoration/README_cn.md
+++ b/demos/punctuation_restoration/README_cn.md
@@ -44,7 +44,7 @@
- Python API
```python
import paddle
- from paddlespeech.cli import TextExecutor
+ from paddlespeech.cli.text import TextExecutor
text_executor = TextExecutor()
result = text_executor(
diff --git a/demos/speaker_verification/README.md b/demos/speaker_verification/README.md
index 63dc9294ec7a8da7abe11465502c466e9ca40e8a..900b5ae4088a4839866a22c094bf159f4a07af6f 100644
--- a/demos/speaker_verification/README.md
+++ b/demos/speaker_verification/README.md
@@ -96,7 +96,7 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
- Python API
```python
- from paddlespeech.cli import VectorExecutor
+ from paddlespeech.cli.vector import VectorExecutor
vector_executor = VectorExecutor()
audio_emb = vector_executor(
diff --git a/demos/speaker_verification/README_cn.md b/demos/speaker_verification/README_cn.md
index 07eeac2ee8a469df4b86b258a14df0f3430d7e58..f6afa86ac832e88fa07d6673041c97e1668bc12e 100644
--- a/demos/speaker_verification/README_cn.md
+++ b/demos/speaker_verification/README_cn.md
@@ -95,7 +95,7 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
- Python API
```python
import paddle
- from paddlespeech.cli import VectorExecutor
+ from paddlespeech.cli.vector import VectorExecutor
vector_executor = VectorExecutor()
audio_emb = vector_executor(
diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md
index 6493e8e613800ea163b8669842c93a7dd82d68ac..c815a88af2de7876404dd6f929e27f37bb3f2edb 100644
--- a/demos/speech_recognition/README.md
+++ b/demos/speech_recognition/README.md
@@ -58,7 +58,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
- Python API
```python
import paddle
- from paddlespeech.cli import ASRExecutor
+ from paddlespeech.cli.asr import ASRExecutor
asr_executor = ASRExecutor()
text = asr_executor(
diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md
index 8d631d89ca1d61196cbf167b3f263cfd478fb571..13aa9f27755482294d4aed8fc25d75f485289224 100644
--- a/demos/speech_recognition/README_cn.md
+++ b/demos/speech_recognition/README_cn.md
@@ -56,7 +56,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
- Python API
```python
import paddle
- from paddlespeech.cli import ASRExecutor
+ from paddlespeech.cli.asr import ASRExecutor
asr_executor = ASRExecutor()
text = asr_executor(
diff --git a/demos/speech_translation/README.md b/demos/speech_translation/README.md
index f675a4eda0a57b9c33633d9b9a9c619a99e8e712..00a9c79324a15f54326454c13cf0c715472ac3a2 100644
--- a/demos/speech_translation/README.md
+++ b/demos/speech_translation/README.md
@@ -47,7 +47,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
- Python API
```python
import paddle
- from paddlespeech.cli import STExecutor
+ from paddlespeech.cli.st import STExecutor
st_executor = STExecutor()
text = st_executor(
diff --git a/demos/speech_translation/README_cn.md b/demos/speech_translation/README_cn.md
index bad9b392f9122d02c9fa3c365d5fc54bb82ed492..5119bf9f4eea5a164966ad0a5c9e1c71768ad55e 100644
--- a/demos/speech_translation/README_cn.md
+++ b/demos/speech_translation/README_cn.md
@@ -47,7 +47,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
- Python API
```python
import paddle
- from paddlespeech.cli import STExecutor
+ from paddlespeech.cli.st import STExecutor
st_executor = STExecutor()
text = st_executor(
diff --git a/demos/streaming_asr_server/conf/ws_conformer_application.yaml b/demos/streaming_asr_server/conf/ws_conformer_application.yaml
index 2affde0739ff5873a88cbe621ebf907ab0663dcb..6a10741bd7c0fb079387086f6abe25e25e0d4539 100644
--- a/demos/streaming_asr_server/conf/ws_conformer_application.yaml
+++ b/demos/streaming_asr_server/conf/ws_conformer_application.yaml
@@ -4,7 +4,7 @@
# SERVER SETTING #
#################################################################################
host: 0.0.0.0
-port: 8090
+port: 8091
# The task format in the engin_list is: <speech task>_<engine type>
# task choices = ['asr_online']
diff --git a/demos/streaming_asr_server/conf/ws_application.yaml b/demos/streaming_asr_server/conf/ws_ds2_application.yaml
similarity index 100%
rename from demos/streaming_asr_server/conf/ws_application.yaml
rename to demos/streaming_asr_server/conf/ws_ds2_application.yaml
diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md
index 2df72a82dec88ddc55505c9575721aee2de09536..389847a129b8d10787679c4442b8a1999ca044c5 100644
--- a/demos/text_to_speech/README.md
+++ b/demos/text_to_speech/README.md
@@ -77,7 +77,7 @@ The input of this demo should be a text of the specific language that can be pas
- Python API
```python
import paddle
- from paddlespeech.cli import TTSExecutor
+ from paddlespeech.cli.tts import TTSExecutor
tts_executor = TTSExecutor()
wav_file = tts_executor(
diff --git a/demos/text_to_speech/README_cn.md b/demos/text_to_speech/README_cn.md
index 7e02b962483b4b0959fa9b9fe0c082bb0a6fdc3e..f967d3d4da47647037ac7a035b3c0ca930762691 100644
--- a/demos/text_to_speech/README_cn.md
+++ b/demos/text_to_speech/README_cn.md
@@ -80,7 +80,7 @@
- Python API
```python
import paddle
- from paddlespeech.cli import TTSExecutor
+ from paddlespeech.cli.tts import TTSExecutor
tts_executor = TTSExecutor()
wav_file = tts_executor(
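As with the other executors, a hedged end-to-end sketch using the relocated TTS import; the output path and the `am`/`voc` choices are illustrative defaults, not mandated by this change:

```python
# Sketch only: Chinese text-to-speech with the per-task CLI import.
import paddle

from paddlespeech.cli.tts import TTSExecutor

tts_executor = TTSExecutor()
wav_file = tts_executor(
    text='今天的天气不错啊',
    output='output.wav',
    am='fastspeech2_csmsc',    # assumed acoustic model, matching the demo defaults
    voc='pwgan_csmsc',         # assumed vocoder, matching the demo defaults
    lang='zh',
    device=paddle.get_device())
print(f'Wave saved to: {wav_file}')
```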
diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index 67e7b62e184600ea82aaf51fb6cf0839c75df246..f341dec3462dd8509f56e00788ce9f9e514a2766 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -88,11 +88,3 @@ ECAPA-TDNN | VoxCeleb| [voxceleb_ecapatdnn](https://github.com/PaddlePaddle/Padd
Model Type | Dataset| Example Link | Pretrained Models
:-------------:| :------------:| :-----: | :-----:
Ernie Linear | IWLST2012_zh |[iwslt2012_punc0](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/iwslt2012/punc0)|[ernie_linear_p3_iwslt2012_zh_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_iwslt2012_zh_ckpt_0.1.1.zip)
-
-## Speech Recognition Model from paddle 1.8
-
-| Acoustic Model |Training Data| Token-based | Size | Descriptions | CER | WER | Hours of speech |
-| :-----:| :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
-| [Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz) | Aishell Dataset | Char-based | 234 MB | 2 Conv + 3 bidirectional GRU layers | 0.0804 | — | 151 h |
-| [Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz) | Librispeech Dataset | Word-based | 307 MB | 2 Conv + 3 bidirectional sharing weight RNN layers | — | 0.0685 | 960 h |
-| [Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz) | Baidu Internal English Dataset | Word-based | 273 MB | 2 Conv + 3 bidirectional GRU layers |— | 0.0541 | 8628 h|
diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md
index 9c505679c91ca9dad9cf7d5f7dee0ec7970a682d..31c99898ccc7839b835a0fbd7daec550a36de340 100644
--- a/examples/aishell3/tts3/README.md
+++ b/examples/aishell3/tts3/README.md
@@ -113,12 +113,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
```
```text
usage: synthesize.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
[--voice-cloning VOICE_CLONING]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--ngpu NGPU]
[--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -127,11 +127,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -143,10 +142,10 @@ optional arguments:
speaker id map file.
--voice-cloning VOICE_CLONING
whether training voice cloning model.
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -162,12 +161,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
```
```text
usage: synthesize_e2e.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT]
[--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--lang LANG]
[--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -177,11 +176,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -192,10 +190,10 @@ optional arguments:
--speaker_dict SPEAKER_DICT
speaker id map file.
--spk_id SPK_ID spk id for multi speaker acoustic model
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -208,9 +206,9 @@ optional arguments:
output dir.
```
1. `--am` is acoustic model type with the format {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model.
+2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model.
3. `--voc` is vocoder type with the format {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
5. `--lang` is the model language, which can be `zh` or `en`.
6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
7. `--text` is the text file, which contains sentences to synthesize.
diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md
index e9e012d29ea95cda33eaccf8201bb0d74a90a9f5..a3daf3dfd6889e60f9d71be396bc9b3d2404fe54 100644
--- a/examples/aishell3/voc1/README.md
+++ b/examples/aishell3/voc1/README.md
@@ -68,7 +68,7 @@ Train a ParallelWaveGAN model.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG ParallelWaveGAN config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
diff --git a/examples/aishell3/voc5/README.md b/examples/aishell3/voc5/README.md
index 84bcd78ef0ca56ef385f56173d866f1cf4c64bfd..c3e3197d63c0e871dabd61e67992335fc8d5d1f9 100644
--- a/examples/aishell3/voc5/README.md
+++ b/examples/aishell3/voc5/README.md
@@ -59,15 +59,13 @@ Here's the complete help message.
```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
- [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
- [--run-benchmark RUN_BENCHMARK]
- [--profiler_options PROFILER_OPTIONS]
+ [--ngpu NGPU]
-Train a ParallelWaveGAN model.
+Train a HiFiGAN model.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG HiFiGAN config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
@@ -75,19 +73,6 @@ optional arguments:
--output-dir OUTPUT_DIR
output dir.
--ngpu NGPU if ngpu == 0, use cpu.
-
-benchmark:
- arguments related to benchmark.
-
- --batch-size BATCH_SIZE
- batch size.
- --max-iter MAX_ITER train max steps.
- --run-benchmark RUN_BENCHMARK
- runing benchmark or not, if True, use the --batch-size
- and --max-iter.
- --profiler_options PROFILER_OPTIONS
- The option of profiler, which should be in format
- "key1=value1;key2=value2;key3=value3".
```
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
diff --git a/examples/csmsc/tts0/README.md b/examples/csmsc/tts0/README.md
index d62c901174ae2ed8e0bb14f93c76f97939b33499..bc7769d1572166b6e27185f282b8ca16b998f40f 100644
--- a/examples/csmsc/tts0/README.md
+++ b/examples/csmsc/tts0/README.md
@@ -103,12 +103,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
```
```text
usage: synthesize.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}]
+ [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
[--voice-cloning VOICE_CLONING]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--ngpu NGPU]
[--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -117,11 +117,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}
+ --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -133,10 +132,10 @@ optional arguments:
speaker id map file.
--voice-cloning VOICE_CLONING
whether training voice cloning model.
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -152,12 +151,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
```
```text
usage: synthesize_e2e.py [-h]
- [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}]
+ [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT]
[--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--lang LANG]
[--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -167,11 +166,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}
+ --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -182,10 +180,10 @@ optional arguments:
--speaker_dict SPEAKER_DICT
speaker id map file.
--spk_id SPK_ID spk id for multi speaker acoustic model
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -198,9 +196,9 @@ optional arguments:
output dir.
```
1. `--am` is acoustic model type with the format {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model.
+2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model.
3. `--voc` is vocoder type with the format {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
5. `--lang` is the model language, which can be `zh` or `en`.
6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
7. `--text` is the text file, which contains sentences to synthesize.
diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md
index 1bcfb383f200b6d454eabac2b85e4b23b32780d1..f45561719ba88485e6da2c20674da215f06d4093 100644
--- a/examples/csmsc/tts2/README.md
+++ b/examples/csmsc/tts2/README.md
@@ -109,12 +109,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
```
```text
usage: synthesize.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
[--voice-cloning VOICE_CLONING]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--ngpu NGPU]
[--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -123,11 +123,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -139,10 +138,10 @@ optional arguments:
speaker id map file.
--voice-cloning VOICE_CLONING
whether training voice cloning model.
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -158,12 +157,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
```
```text
usage: synthesize_e2e.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT]
[--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--lang LANG]
[--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -173,11 +172,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -188,10 +186,10 @@ optional arguments:
--speaker_dict SPEAKER_DICT
speaker id map file.
--spk_id SPK_ID spk id for multi speaker acoustic model
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -204,9 +202,9 @@ optional arguments:
output dir.
```
1. `--am` is acoustic model type with the format {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` and `--tones_dict` are arguments for acoustic model, which correspond to the 5 files in the speedyspeech pretrained model.
+2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` and `--tones_dict` are arguments for acoustic model, which correspond to the 5 files in the speedyspeech pretrained model.
3. `--voc` is vocoder type with the format {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
5. `--lang` is the model language, which can be `zh` or `en`.
6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
7. `--text` is the text file, which contains sentences to synthesize.
diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md
index 1f7dfa0fdd31526f1bf05bca51ed953e6c57227e..371034e772391196ab1b455b0c5f42e86df82b35 100644
--- a/examples/csmsc/tts3/README.md
+++ b/examples/csmsc/tts3/README.md
@@ -111,12 +111,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
```
```text
usage: synthesize.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
[--voice-cloning VOICE_CLONING]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--ngpu NGPU]
[--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -125,11 +125,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -141,10 +140,10 @@ optional arguments:
speaker id map file.
--voice-cloning VOICE_CLONING
whether training voice cloning model.
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -160,12 +159,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
```
```text
usage: synthesize_e2e.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT]
[--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--lang LANG]
[--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -175,11 +174,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -190,10 +188,10 @@ optional arguments:
--speaker_dict SPEAKER_DICT
speaker id map file.
--spk_id SPK_ID spk id for multi speaker acoustic model
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -204,11 +202,12 @@ optional arguments:
--text TEXT text to synthesize, a 'utt_id sentence' pair per line.
--output_dir OUTPUT_DIR
output dir.
+
```
1. `--am` is acoustic model type with the format {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model.
+2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model.
3. `--voc` is vocoder type with the format {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
5. `--lang` is the model language, which can be `zh` or `en`.
6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
7. `--text` is the text file, which contains sentences to synthesize.
diff --git a/examples/csmsc/tts3/README_cn.md b/examples/csmsc/tts3/README_cn.md
index f08ca724c8a7b48bb5783c0b5e37d7e64a4d6595..1829b77063e0ec08f5bd8dcb41405ea815d3ec0e 100644
--- a/examples/csmsc/tts3/README_cn.md
+++ b/examples/csmsc/tts3/README_cn.md
@@ -117,12 +117,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
```
```text
usage: synthesize.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
[--voice-cloning VOICE_CLONING]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--ngpu NGPU]
[--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -131,11 +131,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -147,10 +146,10 @@ optional arguments:
speaker id map file.
--voice-cloning VOICE_CLONING
whether training voice cloning model.
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -167,12 +166,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
```
```text
usage: synthesize_e2e.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT]
[--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--lang LANG]
[--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -182,11 +181,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -197,10 +195,10 @@ optional arguments:
--speaker_dict SPEAKER_DICT
speaker id map file.
--spk_id SPK_ID spk id for multi speaker acoustic model
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -213,9 +211,9 @@ optional arguments:
output dir.
```
1. `--am` 声学模型格式是否符合 {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat` 和 `--phones_dict` 是声学模型的参数,对应于 fastspeech2 预训练模型中的 4 个文件。
+2. `--am_config`, `--am_ckpt`, `--am_stat` 和 `--phones_dict` 是声学模型的参数,对应于 fastspeech2 预训练模型中的 4 个文件。
3. `--voc` 声码器(vocoder)格式是否符合 {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` 是声码器的参数,对应于 parallel wavegan 预训练模型中的 3 个文件。
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` 是声码器的参数,对应于 parallel wavegan 预训练模型中的 3 个文件。
5. `--lang` 对应模型的语言可以是 `zh` 或 `en` 。
6. `--test_metadata` 应为 `dump` 文件夹中 `test` 下的规范化元数据文件、
7. `--text` 是文本文件,其中包含要合成的句子。
diff --git a/examples/csmsc/vits/README.md b/examples/csmsc/vits/README.md
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0c16840a04e32be8fefb3bae6c23fb4bd853be9f 100644
--- a/examples/csmsc/vits/README.md
+++ b/examples/csmsc/vits/README.md
@@ -0,0 +1,146 @@
+# VITS with CSMSC
+This example contains code used to train a [VITS](https://arxiv.org/abs/2106.06103) model with [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html).
+
+## Dataset
+### Download and Extract
+Download CSMSC from its [Official Website](https://test.data-baker.com/data/index/source).
+
+### Get MFA Result and Extract
+We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes for VITS; the durations from MFA are not needed here.
+You can download it from [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
+
+## Get Started
+Assume the path to the dataset is `~/datasets/BZNSYP`.
+Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset.
+3. train the model.
+4. synthesize wavs.
+ - synthesize waveform from `metadata.jsonl`.
+ - synthesize waveform from a text file.
+
+```bash
+./run.sh
+```
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
+```bash
+./local/preprocess.sh ${conf_path}
+```
+When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
+
+```text
+dump
+├── dev
+│ ├── norm
+│ └── raw
+├── phone_id_map.txt
+├── speaker_id_map.txt
+├── test
+│ ├── norm
+│ └── raw
+└── train
+ ├── feats_stats.npy
+ ├── norm
+ └── raw
+```
+The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains wave and linear spectrogram of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/feats_stats.npy`.
+
+Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, feats, feats_lengths, the path of linear spectrogram features, the path of raw waves, speaker, and the id of each utterance.
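+
+A small sketch for inspecting that metadata, assuming one JSON object per line as the `.jsonl` suffix and the field list above suggest:
+```python
+# Sketch: peek at the per-utterance fields in dump/train/norm/metadata.jsonl.
+import json
+
+with open('dump/train/norm/metadata.jsonl', encoding='utf-8') as f:
+    for line in list(f)[:3]:  # a few records are enough for a sanity check
+        record = json.loads(line)
+        # expected keys include utt_id, phones, text_lengths, feats, feats_lengths,
+        # wave and speaker; exact names may differ slightly between examples
+        print(sorted(record.keys()))
+```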
+
+### Model Training
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
+```
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
+Here's the complete help message.
+```text
+usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
+ [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
+ [--ngpu NGPU] [--phones-dict PHONES_DICT]
+
+Train a VITS model.
+
+optional arguments:
+ -h, --help show this help message and exit
+ --config CONFIG config file to overwrite default config.
+ --train-metadata TRAIN_METADATA
+ training data.
+ --dev-metadata DEV_METADATA
+ dev data.
+ --output-dir OUTPUT_DIR
+ output dir.
+ --ngpu NGPU if ngpu == 0, use cpu.
+ --phones-dict PHONES_DICT
+ phone vocabulary file.
+```
+1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
+2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
+3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
+4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
+5. `--phones-dict` is the path of the phone vocabulary file.
+
+### Synthesizing
+
+`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize.py [-h] [--config CONFIG] [--ckpt CKPT]
+ [--phones_dict PHONES_DICT] [--ngpu NGPU]
+ [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
+
+Synthesize with VITS
+
+optional arguments:
+ -h, --help show this help message and exit
+ --config CONFIG Config of VITS.
+ --ckpt CKPT Checkpoint file of VITS.
+ --phones_dict PHONES_DICT
+ phone vocabulary file.
+ --ngpu NGPU if ngpu == 0, use cpu.
+ --test_metadata TEST_METADATA
+ test metadata.
+ --output_dir OUTPUT_DIR
+ output dir.
+```
+`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from a text file.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize_e2e.py [-h] [--config CONFIG] [--ckpt CKPT]
+ [--phones_dict PHONES_DICT] [--lang LANG]
+ [--inference_dir INFERENCE_DIR] [--ngpu NGPU]
+ [--text TEXT] [--output_dir OUTPUT_DIR]
+
+Synthesize with VITS
+
+optional arguments:
+ -h, --help show this help message and exit
+ --config CONFIG Config of VITS.
+ --ckpt CKPT Checkpoint file of VITS.
+ --phones_dict PHONES_DICT
+ phone vocabulary file.
+ --lang LANG Choose model language. zh or en
+ --inference_dir INFERENCE_DIR
+ dir to save inference models
+ --ngpu NGPU if ngpu == 0, use cpu.
+ --text TEXT text to synthesize, a 'utt_id sentence' pair per line.
+ --output_dir OUTPUT_DIR
+ output dir.
+```
+1. `--config`, `--ckpt`, and `--phones_dict` are arguments for acoustic model, which correspond to the 3 files in the VITS pretrained model.
+2. `--lang` is the model language, which can be `zh` or `en`.
+3. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
+4. `--text` is the text file, which contains sentences to synthesize.
+5. `--output_dir` is the directory to save synthesized audio files.
+6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
+
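+The snippet below is a minimal sketch of driving `synthesize_e2e.py` from Python with the arguments documented above; every path and the sample sentence are placeholders, not files shipped with this example.
+```python
+# Sketch only: call the documented CLI with placeholder paths.
+import subprocess
+
+# --text expects one 'utt_id sentence' pair per line; this sentence is illustrative.
+with open('sentences.txt', 'w', encoding='utf-8') as f:
+    f.write('001 欢迎使用语音合成系统\n')
+
+subprocess.run([
+    'python3', 'synthesize_e2e.py',  # resolved as ${BIN_DIR}/synthesize_e2e.py by run.sh
+    '--config', 'conf/default.yaml',
+    '--ckpt', 'exp/default/checkpoints/snapshot_iter_150000.pdz',  # placeholder checkpoint
+    '--phones_dict', 'dump/phone_id_map.txt',
+    '--lang', 'zh',
+    '--text', 'sentences.txt',
+    '--output_dir', 'exp/default/test_e2e',
+], check=True)
+```
+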
+## Pretrained Model
diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md
index d5bec1cd79d21e311d1859b4ef1dde1d59400788..4646a034599a6f71e613b17d42810c267d870df8 100644
--- a/examples/csmsc/voc1/README.md
+++ b/examples/csmsc/voc1/README.md
@@ -65,7 +65,7 @@ Train a ParallelWaveGAN model.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG ParallelWaveGAN config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md
index e188bcb35787a25ddad2f0318238c923a1ff014f..09fb8836c58c03fd000090ab40d04937dcd51c35 100644
--- a/examples/csmsc/voc3/README.md
+++ b/examples/csmsc/voc3/README.md
@@ -63,7 +63,7 @@ Train a Multi-Band MelGAN model.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG Multi-Band MelGAN config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
diff --git a/examples/csmsc/voc4/README.md b/examples/csmsc/voc4/README.md
index 19836134e15c676a05ff35c8ac6cc6af3d1605a5..f1a132a84d7a83bba7cc5d94caea7aa1f0d36398 100644
--- a/examples/csmsc/voc4/README.md
+++ b/examples/csmsc/voc4/README.md
@@ -63,7 +63,7 @@ Train a Style MelGAN model.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG Style MelGAN config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md
index 4c38b5987c9d5fed287d205e1224aa0a511449d3..ef552fd3078e00c5ac030e8a2e732043783e3f19 100644
--- a/examples/csmsc/voc5/README.md
+++ b/examples/csmsc/voc5/README.md
@@ -63,7 +63,7 @@ Train a HiFiGAN model.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG HiFiGAN config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
diff --git a/examples/csmsc/voc6/README.md b/examples/csmsc/voc6/README.md
index 0e5ce633411f1e58dec48ea2141575d5a77353f9..b48c36414b607732ed66741dedb2f27270ba6e84 100644
--- a/examples/csmsc/voc6/README.md
+++ b/examples/csmsc/voc6/README.md
@@ -63,7 +63,7 @@ Train a WaveRNN model.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG WaveRNN config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md
index e3292957b09b8226a01515c01e6c05eb130daca3..85d9e448b8422e32d9058a18d1ac413aff69cce7 100644
--- a/examples/ljspeech/tts0/README.md
+++ b/examples/ljspeech/tts0/README.md
@@ -103,12 +103,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
```
```text
usage: synthesize.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}]
+ [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
[--voice-cloning VOICE_CLONING]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--ngpu NGPU]
[--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -117,11 +117,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}
+ --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -133,10 +132,10 @@ optional arguments:
speaker id map file.
--voice-cloning VOICE_CLONING
whether training voice cloning model.
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -152,12 +151,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
```
```text
usage: synthesize_e2e.py [-h]
- [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}]
+ [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT]
[--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--lang LANG]
[--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -167,11 +166,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}
+ --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -182,10 +180,10 @@ optional arguments:
--speaker_dict SPEAKER_DICT
speaker id map file.
--spk_id SPK_ID spk id for multi speaker acoustic model
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -198,9 +196,9 @@ optional arguments:
output dir.
```
1. `--am` is acoustic model type with the format {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model.
+2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model.
3. `--voc` is vocoder type with the format {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
5. `--lang` is the model language, which can be `zh` or `en`.
6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
7. `--text` is the text file, which contains sentences to synthesize.
diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md
index 9f82185cada1d4873707bb32f36ac4b080848082..85621653f761b05ac382efe83e5f1daa694a06e6 100644
--- a/examples/ljspeech/tts1/README.md
+++ b/examples/ljspeech/tts1/README.md
@@ -58,7 +58,7 @@ Train a TransformerTTS model with LJSpeech TTS dataset.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG TransformerTTS config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md
index 8a666193fd5bf4fbd34c01b65e4fc99717ea8686..81a0580c0a7d658e87ec09a436cb3e48de00fea5 100644
--- a/examples/ljspeech/tts3/README.md
+++ b/examples/ljspeech/tts3/README.md
@@ -107,14 +107,14 @@ pwg_ljspeech_ckpt_0.5
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
-``text
+```text
usage: synthesize.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
[--voice-cloning VOICE_CLONING]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--ngpu NGPU]
[--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -123,11 +123,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -139,10 +138,10 @@ optional arguments:
speaker id map file.
--voice-cloning VOICE_CLONING
whether training voice cloning model.
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -158,12 +157,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
```
```text
usage: synthesize_e2e.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT]
[--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--lang LANG]
[--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -173,11 +172,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -188,10 +186,10 @@ optional arguments:
--speaker_dict SPEAKER_DICT
speaker id map file.
--spk_id SPK_ID spk id for multi speaker acoustic model
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -204,9 +202,9 @@ optional arguments:
output dir.
```
1. `--am` is acoustic model type with the format {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model.
+2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model.
3. `--voc` is vocoder type with the format {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
5. `--lang` is the model language, which can be `zh` or `en`.
6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
7. `--text` is the text file, which contains sentences to synthesize.
diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md
index 491444261d32df254271e319dc283ab91742f1ba..d16c0e35fb2fcf13e1ec52ef608db925dc945f51 100644
--- a/examples/ljspeech/voc1/README.md
+++ b/examples/ljspeech/voc1/README.md
@@ -65,7 +65,7 @@ Train a ParallelWaveGAN model.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG ParallelWaveGAN config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
diff --git a/examples/ljspeech/voc5/README.md b/examples/ljspeech/voc5/README.md
index 8305150422b523155e9d4b62a434e08890eedaa7..d856cfecfdebed32cd33b6ef90285c3c1ec5299a 100644
--- a/examples/ljspeech/voc5/README.md
+++ b/examples/ljspeech/voc5/README.md
@@ -57,15 +57,13 @@ Here's the complete help message.
```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
- [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
- [--run-benchmark RUN_BENCHMARK]
- [--profiler_options PROFILER_OPTIONS]
+ [--ngpu NGPU]
-Train a ParallelWaveGAN model.
+Train a HiFiGAN model.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG HiFiGAN config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
@@ -73,19 +71,6 @@ optional arguments:
--output-dir OUTPUT_DIR
output dir.
--ngpu NGPU if ngpu == 0, use cpu.
-
-benchmark:
- arguments related to benchmark.
-
- --batch-size BATCH_SIZE
- batch size.
- --max-iter MAX_ITER train max steps.
- --run-benchmark RUN_BENCHMARK
- runing benchmark or not, if True, use the --batch-size
- and --max-iter.
- --profiler_options PROFILER_OPTIONS
- The option of profiler, which should be in format
- "key1=value1;key2=value2;key3=value3".
```
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
diff --git a/examples/other/1xt2x/.gitignore b/examples/other/1xt2x/.gitignore
deleted file mode 100644
index a9a5aecf429fd8a0d81fbd5fd37006bfa498d5c1..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-tmp
diff --git a/examples/other/1xt2x/README.md b/examples/other/1xt2x/README.md
deleted file mode 100644
index 49f850d2636da9213e8ca0bf4a4fe021a228976a..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/README.md
+++ /dev/null
@@ -1,19 +0,0 @@
-# 1xt2x
-
-Convert Deepspeech 1.8 released model to 2.x.
-
-## Model source directory
-* Deepspeech2x
-
-## Expriment directory
-* aishell
-* librispeech
-* baidu_en8k
-
-# The released model
-
-Acoustic Model | Training Data | Hours of Speech | Token-based | CER | WER
-:-------------:| :------------:| :---------------: | :---------: | :---: | :----:
-Ds2 Offline Aishell 1xt2x model| Aishell Dataset | 151 h | Char-based | 0.080447 |
-Ds2 Offline Librispeech 1xt2x model | Librispeech Dataset | 960 h | Word-based | | 0.068548
-Ds2 Offline Baidu en8k 1x2x model | Baidu Internal English Dataset | 8628 h |Word-based | | 0.054112
diff --git a/examples/other/1xt2x/aishell/.gitignore b/examples/other/1xt2x/aishell/.gitignore
deleted file mode 100644
index 3631e544a48bf5b3dd9eb7ebef5074cfe21ec78f..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/aishell/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-exp
-data
-*log
-tmp
-nohup*
diff --git a/examples/other/1xt2x/aishell/conf/augmentation.json b/examples/other/1xt2x/aishell/conf/augmentation.json
deleted file mode 100644
index fe51488c7066f6687ef680d6bfaa4f7768ef205c..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/aishell/conf/augmentation.json
+++ /dev/null
@@ -1 +0,0 @@
-[]
diff --git a/examples/other/1xt2x/aishell/conf/deepspeech2.yaml b/examples/other/1xt2x/aishell/conf/deepspeech2.yaml
deleted file mode 100644
index c2db2c7c2d20a232f330d6cd44136c4951981b72..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/aishell/conf/deepspeech2.yaml
+++ /dev/null
@@ -1,65 +0,0 @@
-# https://yaml.org/type/float.html
-###########################################
-# Data #
-###########################################
-train_manifest: data/manifest.train
-dev_manifest: data/manifest.dev
-test_manifest: data/manifest.test
-min_input_len: 0.0
-max_input_len: 27.0 # second
-min_output_len: 0.0
-max_output_len: .inf
-min_output_input_ratio: 0.00
-max_output_input_ratio: .inf
-
-###########################################
-# Dataloader #
-###########################################
-batch_size: 64 # one gpu
-mean_std_filepath: data/mean_std.npz
-unit_type: char
-vocab_filepath: data/vocab.txt
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix:
-spectrum_type: linear
-feat_dim:
-delta_delta: False
-stride_ms: 10.0
-window_ms: 20.0
-n_fft: None
-max_freq: None
-target_sample_rate: 16000
-use_dB_normalization: True
-target_dB: -20
-dither: 1.0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 2
-
-############################################
-# Network Architecture #
-############################################
-num_conv_layers: 2
-num_rnn_layers: 3
-rnn_layer_size: 1024
-use_gru: True
-share_rnn_weights: False
-blank_id: 4333
-
-###########################################
-# Training #
-###########################################
-n_epoch: 80
-accum_grad: 1
-lr: 2e-3
-lr_decay: 0.83
-weight_decay: 1e-06
-global_grad_clip: 3.0
-log_interval: 100
-checkpoint:
- kbest_n: 50
- latest_n: 5
-
-
diff --git a/examples/other/1xt2x/aishell/conf/tuning/decode.yaml b/examples/other/1xt2x/aishell/conf/tuning/decode.yaml
deleted file mode 100644
index b5283a934a235c676c8780af0c81e2fd1d231c5e..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/aishell/conf/tuning/decode.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-decode_batch_size: 32
-error_rate_type: cer
-decoding_method: ctc_beam_search
-lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
-alpha: 2.6
-beta: 5.0
-beam_size: 300
-cutoff_prob: 0.99
-cutoff_top_n: 40
-num_proc_bsearch: 8
\ No newline at end of file
diff --git a/examples/other/1xt2x/aishell/local/data.sh b/examples/other/1xt2x/aishell/local/data.sh
deleted file mode 100755
index a9d5b1412a5117812e37fbdbc2e168732c8d0528..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/aishell/local/data.sh
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/bin/bash
-if [ $# != 1 ];then
- echo "usage: ${0} ckpt_dir"
- exit -1
-fi
-
-ckpt_dir=$1
-
-stage=-1
-stop_stage=100
-
-source ${MAIN_ROOT}/utils/parse_options.sh
-
-mkdir -p data
-TARGET_DIR=${MAIN_ROOT}/dataset
-mkdir -p ${TARGET_DIR}
-
-bash local/download_model.sh ${ckpt_dir}
-if [ $? -ne 0 ]; then
- exit 1
-fi
-
-cd ${ckpt_dir}
-tar xzvf aishell_model_v1.8_to_v2.x.tar.gz
-cd -
-mv ${ckpt_dir}/mean_std.npz data/
-mv ${ckpt_dir}/vocab.txt data/
-
-
-if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
- # download data, generate manifests
- python3 ${TARGET_DIR}/aishell/aishell.py \
- --manifest_prefix="data/manifest" \
- --target_dir="${TARGET_DIR}/aishell"
-
- if [ $? -ne 0 ]; then
- echo "Prepare Aishell failed. Terminated."
- exit 1
- fi
-
- for dataset in train dev test; do
- mv data/manifest.${dataset} data/manifest.${dataset}.raw
- done
-fi
-
-
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
- # format manifest with tokenids, vocab size
- for dataset in train dev test; do
- {
- python3 ${MAIN_ROOT}/utils/format_data.py \
- --cmvn_path "data/mean_std.npz" \
- --unit_type "char" \
- --vocab_path="data/vocab.txt" \
- --manifest_path="data/manifest.${dataset}.raw" \
- --output_path="data/manifest.${dataset}"
-
- if [ $? -ne 0 ]; then
- echo "Formt mnaifest failed. Terminated."
- exit 1
- fi
- } &
- done
- wait
-fi
-
-echo "Aishell data preparation done."
-exit 0
diff --git a/examples/other/1xt2x/aishell/local/download_lm_ch.sh b/examples/other/1xt2x/aishell/local/download_lm_ch.sh
deleted file mode 100755
index 47153f4b60280916663b3e5e9012201cb8ad3e89..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/aishell/local/download_lm_ch.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-. ${MAIN_ROOT}/utils/utility.sh
-
-DIR=data/lm
-mkdir -p ${DIR}
-
-URL='https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm'
-MD5="29e02312deb2e59b3c8686c7966d4fe3"
-TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm
-
-
-echo "Start downloading the language model. The language model is large, please wait for a moment ..."
-download $URL $MD5 $TARGET > /dev/null 2>&1
-if [ $? -ne 0 ]; then
- echo "Fail to download the language model!"
- exit 1
-else
- echo "Download the language model sucessfully"
-fi
-
-
-exit 0
diff --git a/examples/other/1xt2x/aishell/local/download_model.sh b/examples/other/1xt2x/aishell/local/download_model.sh
deleted file mode 100644
index ffa2f8101b3c3e6f681fb7beb760d6e36fd27dca..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/aishell/local/download_model.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#! /usr/bin/env bash
-
-if [ $# != 1 ];then
- echo "usage: ${0} ckpt_dir"
- exit -1
-fi
-
-ckpt_dir=$1
-
-. ${MAIN_ROOT}/utils/utility.sh
-
-URL='https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz'
-MD5=87e7577d4bea737dbf3e8daab37aa808
-TARGET=${ckpt_dir}/aishell_model_v1.8_to_v2.x.tar.gz
-
-
-echo "Download Aishell model ..."
-download $URL $MD5 $TARGET
-if [ $? -ne 0 ]; then
- echo "Fail to download Aishell model!"
- exit 1
-fi
-
-
-exit 0
diff --git a/examples/other/1xt2x/aishell/local/test.sh b/examples/other/1xt2x/aishell/local/test.sh
deleted file mode 100755
index 463593ef389c0256344ba2c40b7cdff426af1248..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/aishell/local/test.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-
-if [ $# != 4 ];then
- echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
- exit -1
-fi
-
-ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
-echo "using $ngpu gpus..."
-
-config_path=$1
-decode_config_path=$2
-ckpt_prefix=$3
-model_type=$4
-
-# download language model
-bash local/download_lm_ch.sh
-if [ $? -ne 0 ]; then
- exit 1
-fi
-
-python3 -u ${BIN_DIR}/test.py \
---ngpu ${ngpu} \
---config ${config_path} \
---decode_cfg ${decode_config_path} \
---result_file ${ckpt_prefix}.rsl \
---checkpoint_path ${ckpt_prefix} \
---model_type ${model_type}
-
-if [ $? -ne 0 ]; then
- echo "Failed in evaluation!"
- exit 1
-fi
-
-
-exit 0
diff --git a/examples/other/1xt2x/aishell/path.sh b/examples/other/1xt2x/aishell/path.sh
deleted file mode 100644
index ce44e65cbbee40d76f0629240d0e38728cf48f55..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/aishell/path.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-export MAIN_ROOT=`realpath ${PWD}/../../../../`
-export LOCAL_DEEPSPEECH2=`realpath ${PWD}/../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-export PYTHONPATH=${LOCAL_DEEPSPEECH2}:${PYTHONPATH}
-
-export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
-
-MODEL=deepspeech2
-export BIN_DIR=${LOCAL_DEEPSPEECH2}/src_deepspeech2x/bin
-echo "BIN_DIR "${BIN_DIR}
diff --git a/examples/other/1xt2x/aishell/run.sh b/examples/other/1xt2x/aishell/run.sh
deleted file mode 100755
index 89a634119ca76f96e8b59d38df0c822ff511d0ee..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/aishell/run.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-set -e
-source path.sh
-
-stage=0
-stop_stage=100
-conf_path=conf/deepspeech2.yaml
-decode_conf_path=conf/tuning/decode.yaml
-avg_num=1
-model_type=offline
-gpus=2
-
-source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
-
-v18_ckpt=aishell_v1.8
-ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
-echo "checkpoint name ${ckpt}"
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
- # prepare data
- mkdir -p exp/${ckpt}/checkpoints
- bash ./local/data.sh exp/${ckpt}/checkpoints || exit -1
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
- # test ckpt avg_n
- CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1
-fi
-
diff --git a/examples/other/1xt2x/baidu_en8k/.gitignore b/examples/other/1xt2x/baidu_en8k/.gitignore
deleted file mode 100644
index 3631e544a48bf5b3dd9eb7ebef5074cfe21ec78f..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/baidu_en8k/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-exp
-data
-*log
-tmp
-nohup*
diff --git a/examples/other/1xt2x/baidu_en8k/conf/augmentation.json b/examples/other/1xt2x/baidu_en8k/conf/augmentation.json
deleted file mode 100644
index fe51488c7066f6687ef680d6bfaa4f7768ef205c..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/baidu_en8k/conf/augmentation.json
+++ /dev/null
@@ -1 +0,0 @@
-[]
diff --git a/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml b/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml
deleted file mode 100644
index 0c08fbc635dc574cc186c7494948d2d9df367290..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml
+++ /dev/null
@@ -1,64 +0,0 @@
-# https://yaml.org/type/float.html
-###########################################
-# Data #
-###########################################
-train_manifest: data/manifest.train
-dev_manifest: data/manifest.dev
-test_manifest: data/manifest.test-clean
-min_input_len: 0.0
-max_input_len: .inf # second
-min_output_len: 0.0
-max_output_len: .inf
-min_output_input_ratio: 0.00
-max_output_input_ratio: .inf
-
-###########################################
-# Dataloader #
-###########################################
-batch_size: 64 # one gpu
-mean_std_filepath: data/mean_std.npz
-unit_type: char
-vocab_filepath: data/vocab.txt
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix:
-spectrum_type: linear
-feat_dim:
-delta_delta: False
-stride_ms: 10.0
-window_ms: 20.0
-n_fft: None
-max_freq: None
-target_sample_rate: 16000
-use_dB_normalization: True
-target_dB: -20
-dither: 1.0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 2
-
-############################################
-# Network Architecture #
-############################################
-num_conv_layers: 2
-num_rnn_layers: 3
-rnn_layer_size: 1024
-use_gru: True
-share_rnn_weights: False
-blank_id: 28
-
-###########################################
-# Training #
-###########################################
-n_epoch: 80
-accum_grad: 1
-lr: 2e-3
-lr_decay: 0.83
-weight_decay: 1e-06
-global_grad_clip: 3.0
-log_interval: 100
-checkpoint:
- kbest_n: 50
- latest_n: 5
-
diff --git a/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml b/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml
deleted file mode 100644
index f52dde320d62769f9bcea53295f6f24460279431..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-decode_batch_size: 32
-error_rate_type: wer
-decoding_method: ctc_beam_search
-lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-alpha: 1.4
-beta: 0.35
-beam_size: 500
-cutoff_prob: 1.0
-cutoff_top_n: 40
-num_proc_bsearch: 8
\ No newline at end of file
diff --git a/examples/other/1xt2x/baidu_en8k/local/data.sh b/examples/other/1xt2x/baidu_en8k/local/data.sh
deleted file mode 100755
index 9b017324dc512f6b61706db8afb393e5ca88b3f3..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/baidu_en8k/local/data.sh
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/bin/bash
-if [ $# != 1 ];then
- echo "usage: ${0} ckpt_dir"
- exit -1
-fi
-
-ckpt_dir=$1
-
-stage=-1
-stop_stage=100
-unit_type=char
-
-source ${MAIN_ROOT}/utils/parse_options.sh
-
-mkdir -p data
-TARGET_DIR=${MAIN_ROOT}/dataset
-mkdir -p ${TARGET_DIR}
-
-
-bash local/download_model.sh ${ckpt_dir}
-if [ $? -ne 0 ]; then
- exit 1
-fi
-
-cd ${ckpt_dir}
-tar xzvf baidu_en8k_v1.8_to_v2.x.tar.gz
-cd -
-mv ${ckpt_dir}/mean_std.npz data/
-mv ${ckpt_dir}/vocab.txt data/
-
-
-if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
- # download data, generate manifests
- python3 ${TARGET_DIR}/librispeech/librispeech.py \
- --manifest_prefix="data/manifest" \
- --target_dir="${TARGET_DIR}/librispeech" \
- --full_download="True"
-
- if [ $? -ne 0 ]; then
- echo "Prepare LibriSpeech failed. Terminated."
- exit 1
- fi
-
- for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
- mv data/manifest.${set} data/manifest.${set}.raw
- done
-
- rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw
- for set in train-clean-100 train-clean-360 train-other-500; do
- cat data/manifest.${set}.raw >> data/manifest.train.raw
- done
-
- for set in dev-clean dev-other; do
- cat data/manifest.${set}.raw >> data/manifest.dev.raw
- done
-
- for set in test-clean test-other; do
- cat data/manifest.${set}.raw >> data/manifest.test.raw
- done
-fi
-
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
- # format manifest with tokenids, vocab size
- for set in train dev test dev-clean dev-other test-clean test-other; do
- {
- python3 ${MAIN_ROOT}/utils/format_data.py \
- --cmvn_path "data/mean_std.npz" \
- --unit_type ${unit_type} \
- --vocab_path="data/vocab.txt" \
- --manifest_path="data/manifest.${set}.raw" \
- --output_path="data/manifest.${set}"
-
- if [ $? -ne 0 ]; then
- echo "Formt mnaifest.${set} failed. Terminated."
- exit 1
- fi
- }&
- done
- wait
-fi
-
-echo "LibriSpeech Data preparation done."
-exit 0
-
diff --git a/examples/other/1xt2x/baidu_en8k/local/download_lm_en.sh b/examples/other/1xt2x/baidu_en8k/local/download_lm_en.sh
deleted file mode 100755
index 390fffc9300fafb2b441e7eaac6fe3c68c0e8475..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/baidu_en8k/local/download_lm_en.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash
-
-. ${MAIN_ROOT}/utils/utility.sh
-
-DIR=data/lm
-mkdir -p ${DIR}
-
-URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm
-MD5="099a601759d467cd0a8523ff939819c5"
-TARGET=${DIR}/common_crawl_00.prune01111.trie.klm
-
-echo "Start downloading the language model. The language model is large, please wait for a moment ..."
-download $URL $MD5 $TARGET > /dev/null 2>&1
-if [ $? -ne 0 ]; then
- echo "Fail to download the language model!"
- exit 1
-else
- echo "Download the language model sucessfully"
-fi
-
-
-exit 0
diff --git a/examples/other/1xt2x/baidu_en8k/local/download_model.sh b/examples/other/1xt2x/baidu_en8k/local/download_model.sh
deleted file mode 100644
index a8fbc31e8a36ecdf520f745a1b27bd38031db921..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/baidu_en8k/local/download_model.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#! /usr/bin/env bash
-if [ $# != 1 ];then
- echo "usage: ${0} ckpt_dir"
- exit -1
-fi
-
-ckpt_dir=$1
-
-
-. ${MAIN_ROOT}/utils/utility.sh
-
-URL='https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz'
-MD5=c1676be8505cee436e6f312823e9008c
-TARGET=${ckpt_dir}/baidu_en8k_v1.8_to_v2.x.tar.gz
-
-
-echo "Download BaiduEn8k model ..."
-download $URL $MD5 $TARGET
-if [ $? -ne 0 ]; then
- echo "Fail to download BaiduEn8k model!"
- exit 1
-fi
-
-
-exit 0
diff --git a/examples/other/1xt2x/baidu_en8k/local/test.sh b/examples/other/1xt2x/baidu_en8k/local/test.sh
deleted file mode 100755
index ea40046b10997ee425d4e654b89fedc732c8b3fe..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/baidu_en8k/local/test.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-
-if [ $# != 4 ];then
- echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
- exit -1
-fi
-
-ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
-echo "using $ngpu gpus..."
-
-config_path=$1
-decode_config_path=$2
-ckpt_prefix=$3
-model_type=$4
-
-# download language model
-bash local/download_lm_en.sh
-if [ $? -ne 0 ]; then
- exit 1
-fi
-
-python3 -u ${BIN_DIR}/test.py \
---ngpu ${ngpu} \
---config ${config_path} \
---decode_cfg ${decode_config_path} \
---result_file ${ckpt_prefix}.rsl \
---checkpoint_path ${ckpt_prefix} \
---model_type ${model_type}
-
-if [ $? -ne 0 ]; then
- echo "Failed in evaluation!"
- exit 1
-fi
-
-
-exit 0
diff --git a/examples/other/1xt2x/baidu_en8k/path.sh b/examples/other/1xt2x/baidu_en8k/path.sh
deleted file mode 100644
index ce44e65cbbee40d76f0629240d0e38728cf48f55..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/baidu_en8k/path.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-export MAIN_ROOT=`realpath ${PWD}/../../../../`
-export LOCAL_DEEPSPEECH2=`realpath ${PWD}/../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-export PYTHONPATH=${LOCAL_DEEPSPEECH2}:${PYTHONPATH}
-
-export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
-
-MODEL=deepspeech2
-export BIN_DIR=${LOCAL_DEEPSPEECH2}/src_deepspeech2x/bin
-echo "BIN_DIR "${BIN_DIR}
diff --git a/examples/other/1xt2x/baidu_en8k/run.sh b/examples/other/1xt2x/baidu_en8k/run.sh
deleted file mode 100755
index 82de56b094f32684c110a49b7ebb144263ae351e..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/baidu_en8k/run.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-set -e
-source path.sh
-
-stage=0
-stop_stage=100
-conf_path=conf/deepspeech2.yaml
-decode_conf_path=conf/tuning/decode.yaml
-avg_num=1
-model_type=offline
-gpus=0
-
-source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
-
-v18_ckpt=baidu_en8k_v1.8
-ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
-echo "checkpoint name ${ckpt}"
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
- # prepare data
- mkdir -p exp/${ckpt}/checkpoints
- bash ./local/data.sh exp/${ckpt}/checkpoints || exit -1
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
- # test ckpt avg_n
- CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1
-fi
-
diff --git a/examples/other/1xt2x/librispeech/.gitignore b/examples/other/1xt2x/librispeech/.gitignore
deleted file mode 100644
index 3631e544a48bf5b3dd9eb7ebef5074cfe21ec78f..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/librispeech/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-exp
-data
-*log
-tmp
-nohup*
diff --git a/examples/other/1xt2x/librispeech/conf/augmentation.json b/examples/other/1xt2x/librispeech/conf/augmentation.json
deleted file mode 100644
index fe51488c7066f6687ef680d6bfaa4f7768ef205c..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/librispeech/conf/augmentation.json
+++ /dev/null
@@ -1 +0,0 @@
-[]
diff --git a/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml b/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml
deleted file mode 100644
index a2a5649baa0ad80615b721d723d5a7672000fadc..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml
+++ /dev/null
@@ -1,64 +0,0 @@
-# https://yaml.org/type/float.html
-###########################################
-# Data #
-###########################################
-train_manifest: data/manifest.train
-dev_manifest: data/manifest.dev
-test_manifest: data/manifest.test-clean
-min_input_len: 0.0
-max_input_len: 1000.0 # second
-min_output_len: 0.0
-max_output_len: .inf
-min_output_input_ratio: 0.00
-max_output_input_ratio: .inf
-
-###########################################
-# Dataloader #
-###########################################
-batch_size: 64 # one gpu
-mean_std_filepath: data/mean_std.npz
-unit_type: char
-vocab_filepath: data/vocab.txt
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix:
-spectrum_type: linear
-feat_dim:
-delta_delta: False
-stride_ms: 10.0
-window_ms: 20.0
-n_fft: None
-max_freq: None
-target_sample_rate: 16000
-use_dB_normalization: True
-target_dB: -20
-dither: 1.0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 2
-
-############################################
-# Network Architecture #
-############################################
-num_conv_layers: 2
-num_rnn_layers: 3
-rnn_layer_size: 2048
-use_gru: False
-share_rnn_weights: True
-blank_id: 28
-
-###########################################
-# Training #
-###########################################
-n_epoch: 80
-accum_grad: 1
-lr: 2e-3
-lr_decay: 0.83
-weight_decay: 1e-06
-global_grad_clip: 3.0
-log_interval: 100
-checkpoint:
- kbest_n: 50
- latest_n: 5
-
diff --git a/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml b/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml
deleted file mode 100644
index f3b51defe10b89a32ca3f71621fafc6f1ad15c77..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-decode_batch_size: 32
-error_rate_type: wer
-decoding_method: ctc_beam_search
-lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-alpha: 2.5
-beta: 0.3
-beam_size: 500
-cutoff_prob: 1.0
-cutoff_top_n: 40
-num_proc_bsearch: 8
\ No newline at end of file
diff --git a/examples/other/1xt2x/librispeech/local/data.sh b/examples/other/1xt2x/librispeech/local/data.sh
deleted file mode 100755
index 43b5426d9d9a3bc9e4fbafcf23251fc7793eb080..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/librispeech/local/data.sh
+++ /dev/null
@@ -1,83 +0,0 @@
-#!/bin/bash
-
-if [ $# != 1 ];then
- echo "usage: ${0} ckpt_dir"
- exit -1
-fi
-
-ckpt_dir=$1
-
-stage=-1
-stop_stage=100
-unit_type=char
-
-source ${MAIN_ROOT}/utils/parse_options.sh
-
-mkdir -p data
-TARGET_DIR=${MAIN_ROOT}/dataset
-mkdir -p ${TARGET_DIR}
-
-bash local/download_model.sh ${ckpt_dir}
-if [ $? -ne 0 ]; then
- exit 1
-fi
-
-cd ${ckpt_dir}
-tar xzvf librispeech_v1.8_to_v2.x.tar.gz
-cd -
-mv ${ckpt_dir}/mean_std.npz data/
-mv ${ckpt_dir}/vocab.txt data/
-
-if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
- # download data, generate manifests
- python3 ${TARGET_DIR}/librispeech/librispeech.py \
- --manifest_prefix="data/manifest" \
- --target_dir="${TARGET_DIR}/librispeech" \
- --full_download="True"
-
- if [ $? -ne 0 ]; then
- echo "Prepare LibriSpeech failed. Terminated."
- exit 1
- fi
-
- for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
- mv data/manifest.${set} data/manifest.${set}.raw
- done
-
- rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw
- for set in train-clean-100 train-clean-360 train-other-500; do
- cat data/manifest.${set}.raw >> data/manifest.train.raw
- done
-
- for set in dev-clean dev-other; do
- cat data/manifest.${set}.raw >> data/manifest.dev.raw
- done
-
- for set in test-clean test-other; do
- cat data/manifest.${set}.raw >> data/manifest.test.raw
- done
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
- # format manifest with tokenids, vocab size
- for set in train dev test dev-clean dev-other test-clean test-other; do
- {
- python3 ${MAIN_ROOT}/utils/format_data.py \
- --cmvn_path "data/mean_std.npz" \
- --unit_type ${unit_type} \
- --vocab_path="data/vocab.txt" \
- --manifest_path="data/manifest.${set}.raw" \
- --output_path="data/manifest.${set}"
-
- if [ $? -ne 0 ]; then
- echo "Formt mnaifest.${set} failed. Terminated."
- exit 1
- fi
- }&
- done
- wait
-fi
-
-echo "LibriSpeech Data preparation done."
-exit 0
-
diff --git a/examples/other/1xt2x/librispeech/local/download_lm_en.sh b/examples/other/1xt2x/librispeech/local/download_lm_en.sh
deleted file mode 100755
index 390fffc9300fafb2b441e7eaac6fe3c68c0e8475..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/librispeech/local/download_lm_en.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash
-
-. ${MAIN_ROOT}/utils/utility.sh
-
-DIR=data/lm
-mkdir -p ${DIR}
-
-URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm
-MD5="099a601759d467cd0a8523ff939819c5"
-TARGET=${DIR}/common_crawl_00.prune01111.trie.klm
-
-echo "Start downloading the language model. The language model is large, please wait for a moment ..."
-download $URL $MD5 $TARGET > /dev/null 2>&1
-if [ $? -ne 0 ]; then
- echo "Fail to download the language model!"
- exit 1
-else
- echo "Download the language model sucessfully"
-fi
-
-
-exit 0
diff --git a/examples/other/1xt2x/librispeech/local/download_model.sh b/examples/other/1xt2x/librispeech/local/download_model.sh
deleted file mode 100644
index 375d66404f6ff2e411188f16277545ee685dda9c..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/librispeech/local/download_model.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#! /usr/bin/env bash
-
-if [ $# != 1 ];then
- echo "usage: ${0} ckpt_dir"
- exit -1
-fi
-
-ckpt_dir=$1
-
-. ${MAIN_ROOT}/utils/utility.sh
-
-URL='https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz'
-MD5=a06d9aadb560ea113984dc98d67232c8
-TARGET=${ckpt_dir}/librispeech_v1.8_to_v2.x.tar.gz
-
-
-echo "Download LibriSpeech model ..."
-download $URL $MD5 $TARGET
-if [ $? -ne 0 ]; then
- echo "Fail to download LibriSpeech model!"
- exit 1
-fi
-
-
-exit 0
diff --git a/examples/other/1xt2x/librispeech/local/test.sh b/examples/other/1xt2x/librispeech/local/test.sh
deleted file mode 100755
index ea40046b10997ee425d4e654b89fedc732c8b3fe..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/librispeech/local/test.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-
-if [ $# != 4 ];then
- echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
- exit -1
-fi
-
-ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
-echo "using $ngpu gpus..."
-
-config_path=$1
-decode_config_path=$2
-ckpt_prefix=$3
-model_type=$4
-
-# download language model
-bash local/download_lm_en.sh
-if [ $? -ne 0 ]; then
- exit 1
-fi
-
-python3 -u ${BIN_DIR}/test.py \
---ngpu ${ngpu} \
---config ${config_path} \
---decode_cfg ${decode_config_path} \
---result_file ${ckpt_prefix}.rsl \
---checkpoint_path ${ckpt_prefix} \
---model_type ${model_type}
-
-if [ $? -ne 0 ]; then
- echo "Failed in evaluation!"
- exit 1
-fi
-
-
-exit 0
diff --git a/examples/other/1xt2x/librispeech/path.sh b/examples/other/1xt2x/librispeech/path.sh
deleted file mode 100644
index e3696ddd56644d19d095167478924a150aacc983..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/librispeech/path.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-export MAIN_ROOT=`realpath ${PWD}/../../../../`
-export LOCAL_DEEPSPEECH2=`realpath ${PWD}/../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-export PYTHONPATH=${LOCAL_DEEPSPEECH2}:${PYTHONPATH}
-
-export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
-
-MODEL=deepspeech2
-export BIN_DIR=${LOCAL_DEEPSPEECH2}/src_deepspeech2x/bin
diff --git a/examples/other/1xt2x/librispeech/run.sh b/examples/other/1xt2x/librispeech/run.sh
deleted file mode 100755
index 8b614bbbfc8841cd6c60a4dcf5331a97349ce146..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/librispeech/run.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-set -e
-source path.sh
-
-stage=0
-stop_stage=100
-conf_path=conf/deepspeech2.yaml
-decode_conf_path=conf/tuning/decode.yaml
-avg_num=1
-model_type=offline
-gpus=1
-
-source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
-
-v18_ckpt=librispeech_v1.8
-ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
-echo "checkpoint name ${ckpt}"
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
- # prepare data
- mkdir -p exp/${ckpt}/checkpoints
- bash ./local/data.sh exp/${ckpt}/checkpoints || exit -1
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
- # test ckpt avg_n
- CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1
-fi
diff --git a/examples/other/1xt2x/src_deepspeech2x/__init__.py b/examples/other/1xt2x/src_deepspeech2x/__init__.py
deleted file mode 100644
index 74be4a2543632a2acbf9d4028762eb0a7e5f44e8..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/src_deepspeech2x/__init__.py
+++ /dev/null
@@ -1,370 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import Any
-from typing import List
-from typing import Tuple
-from typing import Union
-
-import paddle
-from paddle import nn
-from paddle.fluid import core
-from paddle.nn import functional as F
-
-from paddlespeech.s2t.utils.log import Log
-
-#TODO(Hui Zhang): remove fluid import
-logger = Log(__name__).getlog()
-
-########### hack logging #############
-logger.warn = logger.warning
-
-########### hack paddle #############
-paddle.half = 'float16'
-paddle.float = 'float32'
-paddle.double = 'float64'
-paddle.short = 'int16'
-paddle.int = 'int32'
-paddle.long = 'int64'
-paddle.uint16 = 'uint16'
-paddle.cdouble = 'complex128'
-
-
-def convert_dtype_to_string(tensor_dtype):
- """
- Convert the data type in numpy to the data type in Paddle
- Args:
- tensor_dtype(core.VarDesc.VarType): the data type in numpy.
- Returns:
- core.VarDesc.VarType: the data type in Paddle.
- """
- dtype = tensor_dtype
- if dtype == core.VarDesc.VarType.FP32:
- return paddle.float32
- elif dtype == core.VarDesc.VarType.FP64:
- return paddle.float64
- elif dtype == core.VarDesc.VarType.FP16:
- return paddle.float16
- elif dtype == core.VarDesc.VarType.INT32:
- return paddle.int32
- elif dtype == core.VarDesc.VarType.INT16:
- return paddle.int16
- elif dtype == core.VarDesc.VarType.INT64:
- return paddle.int64
- elif dtype == core.VarDesc.VarType.BOOL:
- return paddle.bool
- elif dtype == core.VarDesc.VarType.BF16:
- # since there is still no support for bfloat16 in NumPy,
- # uint16 is used for casting bfloat16
- return paddle.uint16
- elif dtype == core.VarDesc.VarType.UINT8:
- return paddle.uint8
- elif dtype == core.VarDesc.VarType.INT8:
- return paddle.int8
- elif dtype == core.VarDesc.VarType.COMPLEX64:
- return paddle.complex64
- elif dtype == core.VarDesc.VarType.COMPLEX128:
- return paddle.complex128
- else:
- raise ValueError("Not supported tensor dtype %s" % dtype)
-
-
-if not hasattr(paddle, 'softmax'):
- logger.warn("register user softmax to paddle, remove this when fixed!")
- setattr(paddle, 'softmax', paddle.nn.functional.softmax)
-
-if not hasattr(paddle, 'log_softmax'):
- logger.warn("register user log_softmax to paddle, remove this when fixed!")
- setattr(paddle, 'log_softmax', paddle.nn.functional.log_softmax)
-
-if not hasattr(paddle, 'sigmoid'):
- logger.warn("register user sigmoid to paddle, remove this when fixed!")
- setattr(paddle, 'sigmoid', paddle.nn.functional.sigmoid)
-
-if not hasattr(paddle, 'log_sigmoid'):
- logger.warn("register user log_sigmoid to paddle, remove this when fixed!")
- setattr(paddle, 'log_sigmoid', paddle.nn.functional.log_sigmoid)
-
-if not hasattr(paddle, 'relu'):
- logger.warn("register user relu to paddle, remove this when fixed!")
- setattr(paddle, 'relu', paddle.nn.functional.relu)
-
-
-def cat(xs, dim=0):
- return paddle.concat(xs, axis=dim)
-
-
-if not hasattr(paddle, 'cat'):
- logger.warn(
- "override cat of paddle if exists or register, remove this when fixed!")
- paddle.cat = cat
-
-
-########### hack paddle.Tensor #############
-def item(x: paddle.Tensor):
- return x.numpy().item()
-
-
-if not hasattr(paddle.Tensor, 'item'):
- logger.warn(
- "override item of paddle.Tensor if exists or register, remove this when fixed!"
- )
- paddle.Tensor.item = item
-
-
-def func_long(x: paddle.Tensor):
- return paddle.cast(x, paddle.long)
-
-
-if not hasattr(paddle.Tensor, 'long'):
- logger.warn(
- "override long of paddle.Tensor if exists or register, remove this when fixed!"
- )
- paddle.Tensor.long = func_long
-
-if not hasattr(paddle.Tensor, 'numel'):
- logger.warn(
- "override numel of paddle.Tensor if exists or register, remove this when fixed!"
- )
- paddle.Tensor.numel = paddle.numel
-
-
-def new_full(x: paddle.Tensor,
- size: Union[List[int], Tuple[int], paddle.Tensor],
- fill_value: Union[float, int, bool, paddle.Tensor],
- dtype=None):
- return paddle.full(size, fill_value, dtype=x.dtype)
-
-
-if not hasattr(paddle.Tensor, 'new_full'):
- logger.warn(
- "override new_full of paddle.Tensor if exists or register, remove this when fixed!"
- )
- paddle.Tensor.new_full = new_full
-
-
-def eq(xs: paddle.Tensor, ys: Union[paddle.Tensor, float]) -> paddle.Tensor:
- if convert_dtype_to_string(xs.dtype) == paddle.bool:
- xs = xs.astype(paddle.int)
- return xs.equal(
- paddle.to_tensor(
- ys, dtype=convert_dtype_to_string(xs.dtype), place=xs.place))
-
-
-if not hasattr(paddle.Tensor, 'eq'):
- logger.warn(
- "override eq of paddle.Tensor if exists or register, remove this when fixed!"
- )
- paddle.Tensor.eq = eq
-
-if not hasattr(paddle, 'eq'):
- logger.warn(
- "override eq of paddle if exists or register, remove this when fixed!")
- paddle.eq = eq
-
-
-def contiguous(xs: paddle.Tensor) -> paddle.Tensor:
- return xs
-
-
-if not hasattr(paddle.Tensor, 'contiguous'):
- logger.warn(
- "override contiguous of paddle.Tensor if exists or register, remove this when fixed!"
- )
- paddle.Tensor.contiguous = contiguous
-
-
-def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
- nargs = len(args)
- assert (nargs <= 1)
- s = paddle.shape(xs)
- if nargs == 1:
- return s[args[0]]
- else:
- return s
-
-
-#`to_static` do not process `size` property, maybe some `paddle` api dependent on it.
-logger.warn(
- "override size of paddle.Tensor "
- "(`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!"
-)
-paddle.Tensor.size = size
-
-
-def view(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
- return xs.reshape(args)
-
-
-if not hasattr(paddle.Tensor, 'view'):
- logger.warn("register user view to paddle.Tensor, remove this when fixed!")
- paddle.Tensor.view = view
-
-
-def view_as(xs: paddle.Tensor, ys: paddle.Tensor) -> paddle.Tensor:
- return xs.reshape(ys.size())
-
-
-if not hasattr(paddle.Tensor, 'view_as'):
- logger.warn(
- "register user view_as to paddle.Tensor, remove this when fixed!")
- paddle.Tensor.view_as = view_as
-
-
-def is_broadcastable(shp1, shp2):
- for a, b in zip(shp1[::-1], shp2[::-1]):
- if a == 1 or b == 1 or a == b:
- pass
- else:
- return False
- return True
-
-
-def masked_fill(xs: paddle.Tensor,
- mask: paddle.Tensor,
- value: Union[float, int]):
- assert is_broadcastable(xs.shape, mask.shape) is True
- bshape = paddle.broadcast_shape(xs.shape, mask.shape)
- mask = mask.broadcast_to(bshape)
- trues = paddle.ones_like(xs) * value
- xs = paddle.where(mask, trues, xs)
- return xs
-
-
-if not hasattr(paddle.Tensor, 'masked_fill'):
- logger.warn(
- "register user masked_fill to paddle.Tensor, remove this when fixed!")
- paddle.Tensor.masked_fill = masked_fill
-
-
-def masked_fill_(xs: paddle.Tensor,
- mask: paddle.Tensor,
- value: Union[float, int]) -> paddle.Tensor:
- assert is_broadcastable(xs.shape, mask.shape) is True
- bshape = paddle.broadcast_shape(xs.shape, mask.shape)
- mask = mask.broadcast_to(bshape)
- trues = paddle.ones_like(xs) * value
- ret = paddle.where(mask, trues, xs)
- paddle.assign(ret.detach(), output=xs)
- return xs
-
-
-if not hasattr(paddle.Tensor, 'masked_fill_'):
- logger.warn(
- "register user masked_fill_ to paddle.Tensor, remove this when fixed!")
- paddle.Tensor.masked_fill_ = masked_fill_
-
-
-def fill_(xs: paddle.Tensor, value: Union[float, int]) -> paddle.Tensor:
- val = paddle.full_like(xs, value)
- paddle.assign(val.detach(), output=xs)
- return xs
-
-
-if not hasattr(paddle.Tensor, 'fill_'):
- logger.warn("register user fill_ to paddle.Tensor, remove this when fixed!")
- paddle.Tensor.fill_ = fill_
-
-
-def repeat(xs: paddle.Tensor, *size: Any) -> paddle.Tensor:
- return paddle.tile(xs, size)
-
-
-if not hasattr(paddle.Tensor, 'repeat'):
- logger.warn(
- "register user repeat to paddle.Tensor, remove this when fixed!")
- paddle.Tensor.repeat = repeat
-
-if not hasattr(paddle.Tensor, 'softmax'):
- logger.warn(
- "register user softmax to paddle.Tensor, remove this when fixed!")
- setattr(paddle.Tensor, 'softmax', paddle.nn.functional.softmax)
-
-if not hasattr(paddle.Tensor, 'sigmoid'):
- logger.warn(
- "register user sigmoid to paddle.Tensor, remove this when fixed!")
- setattr(paddle.Tensor, 'sigmoid', paddle.nn.functional.sigmoid)
-
-if not hasattr(paddle.Tensor, 'relu'):
- logger.warn("register user relu to paddle.Tensor, remove this when fixed!")
- setattr(paddle.Tensor, 'relu', paddle.nn.functional.relu)
-
-
-def type_as(x: paddle.Tensor, other: paddle.Tensor) -> paddle.Tensor:
- return x.astype(other.dtype)
-
-
-if not hasattr(paddle.Tensor, 'type_as'):
- logger.warn(
- "register user type_as to paddle.Tensor, remove this when fixed!")
- setattr(paddle.Tensor, 'type_as', type_as)
-
-
-def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
- assert len(args) == 1
- if isinstance(args[0], str): # dtype
- return x.astype(args[0])
- elif isinstance(args[0], paddle.Tensor): #Tensor
- return x.astype(args[0].dtype)
- else: # Device
- return x
-
-
-if not hasattr(paddle.Tensor, 'to'):
- logger.warn("register user to to paddle.Tensor, remove this when fixed!")
- setattr(paddle.Tensor, 'to', to)
-
-
-def func_float(x: paddle.Tensor) -> paddle.Tensor:
- return x.astype(paddle.float)
-
-
-if not hasattr(paddle.Tensor, 'float'):
- logger.warn("register user float to paddle.Tensor, remove this when fixed!")
- setattr(paddle.Tensor, 'float', func_float)
-
-
-def func_int(x: paddle.Tensor) -> paddle.Tensor:
- return x.astype(paddle.int)
-
-
-if not hasattr(paddle.Tensor, 'int'):
- logger.warn("register user int to paddle.Tensor, remove this when fixed!")
- setattr(paddle.Tensor, 'int', func_int)
-
-
-def tolist(x: paddle.Tensor) -> List[Any]:
- return x.numpy().tolist()
-
-
-if not hasattr(paddle.Tensor, 'tolist'):
- logger.warn(
- "register user tolist to paddle.Tensor, remove this when fixed!")
- setattr(paddle.Tensor, 'tolist', tolist)
-
-
-########### hack paddle.nn #############
-class GLU(nn.Layer):
- """Gated Linear Units (GLU) Layer"""
-
- def __init__(self, dim: int=-1):
- super().__init__()
- self.dim = dim
-
- def forward(self, xs):
- return F.glu(xs, axis=self.dim)
-
-
-if not hasattr(paddle.nn, 'GLU'):
- logger.warn("register user GLU to paddle.nn, remove this when fixed!")
- setattr(paddle.nn, 'GLU', GLU)
diff --git a/examples/other/1xt2x/src_deepspeech2x/bin/test.py b/examples/other/1xt2x/src_deepspeech2x/bin/test.py
deleted file mode 100644
index 88a13fdca31276f9d9aeeeb0bf0d42e7bec4d4c3..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/src_deepspeech2x/bin/test.py
+++ /dev/null
@@ -1,59 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Evaluation for DeepSpeech2 model."""
-from src_deepspeech2x.test_model import DeepSpeech2Tester as Tester
-from yacs.config import CfgNode
-
-from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
-
-
-def main_sp(config, args):
- exp = Tester(config, args)
- exp.setup()
- exp.run_test()
-
-
-def main(config, args):
- main_sp(config, args)
-
-
-if __name__ == "__main__":
- parser = default_argument_parser()
- parser.add_argument(
- "--model_type", type=str, default='offline', help='offline/online')
- # save asr result to
- parser.add_argument(
- "--result_file", type=str, help="path of save the asr result")
- args = parser.parse_args()
- print_arguments(args, globals())
- print("model_type:{}".format(args.model_type))
-
- # https://yaml.org/type/float.html
- config = CfgNode(new_allowed=True)
- if args.config:
- config.merge_from_file(args.config)
- if args.decode_cfg:
- decode_confs = CfgNode(new_allowed=True)
- decode_confs.merge_from_file(args.decode_cfg)
- config.decode = decode_confs
- if args.opts:
- config.merge_from_list(args.opts)
- config.freeze()
- print(config)
- if args.dump_config:
- with open(args.dump_config, 'w') as f:
- print(config, file=f)
-
- main(config, args)
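
The deleted `test.py` above builds its configuration in a fixed precedence: base YAML, then the decode YAML attached under `config.decode`, then `--opts` overrides, then `freeze()`. A minimal self-contained sketch of that ordering (values are made up; real runs call `merge_from_file` on the files passed on the command line):

```python
from yacs.config import CfgNode

config = CfgNode(new_allowed=True)        # base config; normally merge_from_file(args.config)
config.model_type = 'offline'

decode_confs = CfgNode(new_allowed=True)  # normally merge_from_file(args.decode_cfg)
decode_confs.decoding_method = 'ctc_beam_search'
decode_confs.beam_size = 300

config.decode = decode_confs                         # nested decode section
config.merge_from_list(['decode.beam_size', '500'])  # CLI --opts win last
config.freeze()                                      # lock against later mutation
print(config.decode.beam_size)                       # 500
```
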
diff --git a/examples/other/1xt2x/src_deepspeech2x/models/__init__.py b/examples/other/1xt2x/src_deepspeech2x/models/__init__.py
deleted file mode 100644
index 185a92b8d94d3426d616c0624f0f2ee04339349e..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/src_deepspeech2x/models/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/examples/other/1xt2x/src_deepspeech2x/models/ds2/__init__.py b/examples/other/1xt2x/src_deepspeech2x/models/ds2/__init__.py
deleted file mode 100644
index 39bea5bf9da14bd4ebd89518dd68789534cfd266..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/src_deepspeech2x/models/ds2/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .deepspeech2 import DeepSpeech2InferModel
-from .deepspeech2 import DeepSpeech2Model
-
-__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel']
diff --git a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py
deleted file mode 100644
index f6e185ff1dd8f461635c1c64c941893aed4984d3..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py
+++ /dev/null
@@ -1,275 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Deepspeech2 ASR Model"""
-import paddle
-from paddle import nn
-from src_deepspeech2x.models.ds2.rnn import RNNStack
-
-from paddlespeech.s2t.models.ds2.conv import ConvStack
-from paddlespeech.s2t.modules.ctc import CTCDecoder
-from paddlespeech.s2t.utils import layer_tools
-from paddlespeech.s2t.utils.checkpoint import Checkpoint
-from paddlespeech.s2t.utils.log import Log
-logger = Log(__name__).getlog()
-
-__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel']
-
-
-class CRNNEncoder(nn.Layer):
- def __init__(self,
- feat_size,
- dict_size,
- num_conv_layers=2,
- num_rnn_layers=3,
- rnn_size=1024,
- use_gru=False,
- share_rnn_weights=True):
- super().__init__()
- self.rnn_size = rnn_size
- self.feat_size = feat_size # 161 for linear
- self.dict_size = dict_size
-
- self.conv = ConvStack(feat_size, num_conv_layers)
-
- i_size = self.conv.output_height # H after conv stack
- self.rnn = RNNStack(
- i_size=i_size,
- h_size=rnn_size,
- num_stacks=num_rnn_layers,
- use_gru=use_gru,
- share_rnn_weights=share_rnn_weights)
-
- @property
- def output_size(self):
- return self.rnn_size * 2
-
- def forward(self, audio, audio_len):
- """Compute Encoder outputs
-
- Args:
-            audio (Tensor): [B, Tmax, D]
-            audio_len (Tensor): [B]
- Returns:
- x (Tensor): encoder outputs, [B, T, D]
- x_lens (Tensor): encoder length, [B]
- """
- # [B, T, D] -> [B, D, T]
- audio = audio.transpose([0, 2, 1])
- # [B, D, T] -> [B, C=1, D, T]
- x = audio.unsqueeze(1)
- x_lens = audio_len
-
- # convolution group
- x, x_lens = self.conv(x, x_lens)
- x_val = x.numpy()
-
- # convert data from convolution feature map to sequence of vectors
- #B, C, D, T = paddle.shape(x) # not work under jit
- x = x.transpose([0, 3, 1, 2]) #[B, T, C, D]
- #x = x.reshape([B, T, C * D]) #[B, T, C*D] # not work under jit
- x = x.reshape([0, 0, -1]) #[B, T, C*D]
-
- # remove padding part
- x, x_lens = self.rnn(x, x_lens) #[B, T, D]
- return x, x_lens
-
-
-class DeepSpeech2Model(nn.Layer):
- """The DeepSpeech2 network structure.
-
- :param audio_data: Audio spectrogram data layer.
- :type audio_data: Variable
- :param text_data: Transcription text data layer.
- :type text_data: Variable
- :param audio_len: Valid sequence length data layer.
- :type audio_len: Variable
- :param masks: Masks data layer to reset padding.
- :type masks: Variable
- :param dict_size: Dictionary size for tokenized transcription.
- :type dict_size: int
- :param num_conv_layers: Number of stacking convolution layers.
- :type num_conv_layers: int
- :param num_rnn_layers: Number of stacking RNN layers.
- :type num_rnn_layers: int
- :param rnn_size: RNN layer size (dimension of RNN cells).
- :type rnn_size: int
- :param use_gru: Use gru if set True. Use simple rnn if set False.
- :type use_gru: bool
- :param share_rnn_weights: Whether to share input-hidden weights between
- forward and backward direction RNNs.
- It is only available when use_gru=False.
-    :type share_rnn_weights: bool
- :return: A tuple of an output unnormalized log probability layer (
- before softmax) and a ctc cost layer.
- :rtype: tuple of LayerOutput
- """
-
- def __init__(self,
- feat_size,
- dict_size,
- num_conv_layers=2,
- num_rnn_layers=3,
- rnn_size=1024,
- use_gru=False,
- share_rnn_weights=True,
- blank_id=0):
- super().__init__()
- self.encoder = CRNNEncoder(
- feat_size=feat_size,
- dict_size=dict_size,
- num_conv_layers=num_conv_layers,
- num_rnn_layers=num_rnn_layers,
- rnn_size=rnn_size,
- use_gru=use_gru,
- share_rnn_weights=share_rnn_weights)
- assert (self.encoder.output_size == rnn_size * 2)
-
- self.decoder = CTCDecoder(
-            odim=dict_size,  # <blank> and <eos> is in vocab
- enc_n_units=self.encoder.output_size,
-            blank_id=blank_id,  # first token is <blank>
- dropout_rate=0.0,
- reduction=True, # sum
- batch_average=True) # sum / batch_size
-
- def forward(self, audio, audio_len, text, text_len):
- """Compute Model loss
-
- Args:
- audio (Tensor): [B, T, D]
- audio_len (Tensor): [B]
- text (Tensor): [B, U]
- text_len (Tensor): [B]
-
- Returns:
- loss (Tensor): [1]
- """
- eouts, eouts_len = self.encoder(audio, audio_len)
- loss = self.decoder(eouts, eouts_len, text, text_len)
- return loss
-
- @paddle.no_grad()
- def decode(self, audio, audio_len):
- # decoders only accept string encoded in utf-8
-
- # Make sure the decoder has been initialized
- eouts, eouts_len = self.encoder(audio, audio_len)
- probs = self.decoder.softmax(eouts)
- batch_size = probs.shape[0]
- self.decoder.reset_decoder(batch_size=batch_size)
- self.decoder.next(probs, eouts_len)
- trans_best, trans_beam = self.decoder.decode()
- return trans_best
-
- @classmethod
- def from_pretrained(cls, dataloader, config, checkpoint_path):
- """Build a DeepSpeech2Model model from a pretrained model.
- Parameters
- ----------
- dataloader: paddle.io.DataLoader
-
- config: yacs.config.CfgNode
- model configs
-
- checkpoint_path: Path or str
- the path of pretrained model checkpoint, without extension name
-
- Returns
- -------
- DeepSpeech2Model
- The model built from pretrained result.
- """
- model = cls(feat_size=dataloader.collate_fn.feature_size,
- dict_size=len(dataloader.collate_fn.vocab_list),
- num_conv_layers=config.num_conv_layers,
- num_rnn_layers=config.num_rnn_layers,
- rnn_size=config.rnn_layer_size,
- use_gru=config.use_gru,
- share_rnn_weights=config.share_rnn_weights)
- infos = Checkpoint().load_parameters(
- model, checkpoint_path=checkpoint_path)
- logger.info(f"checkpoint info: {infos}")
- layer_tools.summary(model)
- return model
-
- @classmethod
- def from_config(cls, config):
- """Build a DeepSpeec2Model from config
- Parameters
-
- config: yacs.config.CfgNode
- config
- Returns
- -------
- DeepSpeech2Model
- The model built from config.
- """
- model = cls(feat_size=config.feat_size,
- dict_size=config.dict_size,
- num_conv_layers=config.num_conv_layers,
- num_rnn_layers=config.num_rnn_layers,
- rnn_size=config.rnn_layer_size,
- use_gru=config.use_gru,
- share_rnn_weights=config.share_rnn_weights,
- blank_id=config.blank_id)
- return model
-
-
-class DeepSpeech2InferModel(DeepSpeech2Model):
- def __init__(self,
- feat_size,
- dict_size,
- num_conv_layers=2,
- num_rnn_layers=3,
- rnn_size=1024,
- use_gru=False,
- share_rnn_weights=True,
- blank_id=0):
- super().__init__(
- feat_size=feat_size,
- dict_size=dict_size,
- num_conv_layers=num_conv_layers,
- num_rnn_layers=num_rnn_layers,
- rnn_size=rnn_size,
- use_gru=use_gru,
- share_rnn_weights=share_rnn_weights,
- blank_id=blank_id)
-
- def forward(self, audio, audio_len):
- """export model function
-
- Args:
- audio (Tensor): [B, T, D]
- audio_len (Tensor): [B]
-
- Returns:
- probs: probs after softmax
- """
- eouts, eouts_len = self.encoder(audio, audio_len)
- probs = self.decoder.softmax(eouts)
- return probs, eouts_len
-
- def export(self):
- static_model = paddle.jit.to_static(
- self,
- input_spec=[
- paddle.static.InputSpec(
- shape=[None, None, self.encoder.feat_size],
- dtype='float32'), # audio, [B,T,D]
- paddle.static.InputSpec(shape=[None],
- dtype='int64'), # audio_length, [B]
- ])
- return static_model
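
One detail worth noting in the deleted encoder is the `reshape([0, 0, -1])` call: Paddle treats a 0 in the target shape as "keep this dimension", which flattens the convolutional feature map without querying dynamic shapes, so the graph stays exportable with `paddle.jit.to_static`. A small standalone sketch with made-up sizes:

```python
import paddle

B, C, D, T = 4, 32, 41, 100      # hypothetical sizes after the conv stack
x = paddle.randn([B, C, D, T])

x = x.transpose([0, 3, 1, 2])    # [B, T, C, D]
x = x.reshape([0, 0, -1])        # [B, T, C*D]; 0 keeps batch and time dims
print(x.shape)                   # [4, 100, 1312]
```
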
diff --git a/examples/other/1xt2x/src_deepspeech2x/models/ds2/rnn.py b/examples/other/1xt2x/src_deepspeech2x/models/ds2/rnn.py
deleted file mode 100644
index 383a07467027194062108733d780a77f0483155d..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/src_deepspeech2x/models/ds2/rnn.py
+++ /dev/null
@@ -1,334 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import math
-
-import paddle
-from paddle import nn
-from paddle.nn import functional as F
-from paddle.nn import initializer as I
-
-from paddlespeech.s2t.modules.activation import brelu
-from paddlespeech.s2t.modules.mask import make_non_pad_mask
-from paddlespeech.s2t.utils.log import Log
-logger = Log(__name__).getlog()
-
-__all__ = ['RNNStack']
-
-
-class RNNCell(nn.RNNCellBase):
- r"""
- Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it
- computes the outputs and updates states.
- The formula used is as follows:
- .. math::
- h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh})
- y_{t} & = h_{t}
-
- where :math:`act` is for :attr:`activation`.
- """
-
- def __init__(self,
- hidden_size: int,
- activation="tanh",
- weight_ih_attr=None,
- weight_hh_attr=None,
- bias_ih_attr=None,
- bias_hh_attr=None,
- name=None):
- super().__init__()
- std = 1.0 / math.sqrt(hidden_size)
- self.weight_hh = self.create_parameter(
- (hidden_size, hidden_size),
- weight_hh_attr,
- default_initializer=I.Uniform(-std, std))
- self.bias_ih = None
- self.bias_hh = self.create_parameter(
- (hidden_size, ),
- bias_hh_attr,
- is_bias=True,
- default_initializer=I.Uniform(-std, std))
-
- self.hidden_size = hidden_size
- if activation not in ["tanh", "relu", "brelu"]:
- raise ValueError(
- "activation for SimpleRNNCell should be tanh or relu, "
- "but get {}".format(activation))
- self.activation = activation
- self._activation_fn = paddle.tanh \
- if activation == "tanh" \
- else F.relu
- if activation == 'brelu':
- self._activation_fn = brelu
-
- def forward(self, inputs, states=None):
- if states is None:
- states = self.get_initial_states(inputs, self.state_shape)
- pre_h = states
- i2h = inputs
- if self.bias_ih is not None:
- i2h += self.bias_ih
- h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True)
- if self.bias_hh is not None:
- h2h += self.bias_hh
- h = self._activation_fn(i2h + h2h)
- return h, h
-
- @property
- def state_shape(self):
- return (self.hidden_size, )
-
-
-class GRUCell(nn.RNNCellBase):
- r"""
- Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states,
- it computes the outputs and updates states.
- The formula for GRU used is as follows:
- .. math::
- r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})
- z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})
- \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc}))
- h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t}
- y_{t} & = h_{t}
-
-    where :math:`\sigma` is the sigmoid function, and * is the elementwise
- multiplication operator.
- """
-
- def __init__(self,
- input_size: int,
- hidden_size: int,
- weight_ih_attr=None,
- weight_hh_attr=None,
- bias_ih_attr=None,
- bias_hh_attr=None,
- name=None):
- super().__init__()
- std = 1.0 / math.sqrt(hidden_size)
- self.weight_hh = self.create_parameter(
- (3 * hidden_size, hidden_size),
- weight_hh_attr,
- default_initializer=I.Uniform(-std, std))
- self.bias_ih = None
- self.bias_hh = self.create_parameter(
- (3 * hidden_size, ),
- bias_hh_attr,
- is_bias=True,
- default_initializer=I.Uniform(-std, std))
-
- self.hidden_size = hidden_size
- self.input_size = input_size
- self._gate_activation = F.sigmoid
- self._activation = paddle.relu
-
- def forward(self, inputs, states=None):
- if states is None:
- states = self.get_initial_states(inputs, self.state_shape)
-
- pre_hidden = states # shape [batch_size, hidden_size]
-
- x_gates = inputs
- if self.bias_ih is not None:
- x_gates = x_gates + self.bias_ih
- bias_u, bias_r, bias_c = paddle.split(
- self.bias_hh, num_or_sections=3, axis=0)
-
- weight_hh = paddle.transpose(
- self.weight_hh,
- perm=[1, 0]) #weight_hh:shape[hidden_size, 3 * hidden_size]
- w_u_r_c = paddle.flatten(weight_hh)
- size_u_r = self.hidden_size * 2 * self.hidden_size
- w_u_r = paddle.reshape(w_u_r_c[:size_u_r],
- (self.hidden_size, self.hidden_size * 2))
- w_u, w_r = paddle.split(w_u_r, num_or_sections=2, axis=1)
- w_c = paddle.reshape(w_u_r_c[size_u_r:],
- (self.hidden_size, self.hidden_size))
-
- h_u = paddle.matmul(
- pre_hidden, w_u,
- transpose_y=False) + bias_u #shape [batch_size, hidden_size]
- h_r = paddle.matmul(
- pre_hidden, w_r,
- transpose_y=False) + bias_r #shape [batch_size, hidden_size]
-
- x_u, x_r, x_c = paddle.split(
- x_gates, num_or_sections=3, axis=1) #shape[batch_size, hidden_size]
-
- u = self._gate_activation(x_u + h_u) #shape [batch_size, hidden_size]
- r = self._gate_activation(x_r + h_r) #shape [batch_size, hidden_size]
- c = self._activation(
- x_c + paddle.matmul(r * pre_hidden, w_c, transpose_y=False) +
- bias_c) # [batch_size, hidden_size]
-
- h = (1 - u) * pre_hidden + u * c
- # https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru
- return h, h
-
- @property
- def state_shape(self):
- r"""
- The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch
- size would be automatically inserted into shape). The shape corresponds
- to the shape of :math:`h_{t-1}`.
- """
- return (self.hidden_size, )
-
-
-class BiRNNWithBN(nn.Layer):
- """Bidirectonal simple rnn layer with sequence-wise batch normalization.
- The batch normalization is only performed on input-state weights.
-
- :param size: Dimension of RNN cells.
- :type size: int
- :param share_weights: Whether to share input-hidden weights between
- forward and backward directional RNNs.
- :type share_weights: bool
- :return: Bidirectional simple rnn layer.
- :rtype: Variable
- """
-
- def __init__(self, i_size: int, h_size: int, share_weights: bool):
- super().__init__()
- self.share_weights = share_weights
- if self.share_weights:
- #input-hidden weights shared between bi-directional rnn.
- self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
- # batch norm is only performed on input-state projection
- self.fw_bn = nn.BatchNorm1D(
- h_size, bias_attr=None, data_format='NLC')
- self.bw_fc = self.fw_fc
- self.bw_bn = self.fw_bn
- else:
- self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
- self.fw_bn = nn.BatchNorm1D(
- h_size, bias_attr=None, data_format='NLC')
- self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False)
- self.bw_bn = nn.BatchNorm1D(
- h_size, bias_attr=None, data_format='NLC')
-
- self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu')
- self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu')
- self.fw_rnn = nn.RNN(
- self.fw_cell, is_reverse=False, time_major=False) #[B, T, D]
- self.bw_rnn = nn.RNN(
- self.bw_cell, is_reverse=True, time_major=False) #[B, T, D]
-
- def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
- # x, shape [B, T, D]
- fw_x = self.fw_bn(self.fw_fc(x))
- bw_x = self.bw_bn(self.bw_fc(x))
- fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
- bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
- x = paddle.concat([fw_x, bw_x], axis=-1)
- return x, x_len
-
-
-class BiGRUWithBN(nn.Layer):
- """Bidirectonal gru layer with sequence-wise batch normalization.
- The batch normalization is only performed on input-state weights.
-
- :param name: Name of the layer.
- :type name: string
- :param input: Input layer.
- :type input: Variable
- :param size: Dimension of GRU cells.
- :type size: int
- :param act: Activation type.
- :type act: string
- :return: Bidirectional GRU layer.
- :rtype: Variable
- """
-
- def __init__(self, i_size: int, h_size: int):
- super().__init__()
- hidden_size = h_size * 3
-
- self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
- self.fw_bn = nn.BatchNorm1D(
- hidden_size, bias_attr=None, data_format='NLC')
- self.bw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
- self.bw_bn = nn.BatchNorm1D(
- hidden_size, bias_attr=None, data_format='NLC')
-
- self.fw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
- self.bw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
- self.fw_rnn = nn.RNN(
- self.fw_cell, is_reverse=False, time_major=False) #[B, T, D]
- self.bw_rnn = nn.RNN(
- self.bw_cell, is_reverse=True, time_major=False) #[B, T, D]
-
- def forward(self, x, x_len):
- # x, shape [B, T, D]
- fw_x = self.fw_bn(self.fw_fc(x))
-
- bw_x = self.bw_bn(self.bw_fc(x))
- fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
- bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
- x = paddle.concat([fw_x, bw_x], axis=-1)
- return x, x_len
-
-
-class RNNStack(nn.Layer):
- """RNN group with stacked bidirectional simple RNN or GRU layers.
-
- :param input: Input layer.
- :type input: Variable
- :param size: Dimension of RNN cells in each layer.
- :type size: int
- :param num_stacks: Number of stacked rnn layers.
- :type num_stacks: int
- :param use_gru: Use gru if set True. Use simple rnn if set False.
- :type use_gru: bool
- :param share_rnn_weights: Whether to share input-hidden weights between
- forward and backward directional RNNs.
- It is only available when use_gru=False.
-    :type share_rnn_weights: bool
- :return: Output layer of the RNN group.
- :rtype: Variable
- """
-
- def __init__(self,
- i_size: int,
- h_size: int,
- num_stacks: int,
- use_gru: bool,
- share_rnn_weights: bool):
- super().__init__()
- rnn_stacks = []
- for i in range(num_stacks):
- if use_gru:
- #default:GRU using tanh
- rnn_stacks.append(BiGRUWithBN(i_size=i_size, h_size=h_size))
- else:
- rnn_stacks.append(
- BiRNNWithBN(
- i_size=i_size,
- h_size=h_size,
- share_weights=share_rnn_weights))
- i_size = h_size * 2
-
- self.rnn_stacks = nn.LayerList(rnn_stacks)
-
- def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
- """
- x: shape [B, T, D]
-        x_len: shape [B]
- """
- for i, rnn in enumerate(self.rnn_stacks):
- x, x_len = rnn(x, x_len)
- masks = make_non_pad_mask(x_len) #[B, T]
- masks = masks.unsqueeze(-1) # [B, T, 1]
- # TODO(Hui Zhang): not support bool multiply
- masks = masks.astype(x.dtype)
- x = x.multiply(masks)
- return x, x_len
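
The deleted `RNNStack.forward` re-applies a length mask after every layer so that activations computed on padded frames cannot leak into the next layer. A self-contained sketch of that masking step (the mask is built by hand here instead of calling `make_non_pad_mask`; sizes are made up):

```python
import paddle

B, T, D = 2, 5, 3
x = paddle.ones([B, T, D])
x_len = paddle.to_tensor([5, 3])             # true frame counts per utterance

time_idx = paddle.arange(T).unsqueeze(0)     # [1, T]
masks = time_idx < x_len.unsqueeze(1)        # [B, T], True on valid frames
masks = masks.unsqueeze(-1).astype(x.dtype)  # [B, T, 1]; bool multiply is unsupported

x = x.multiply(masks)                        # zero out padded frames
print(x[1, :, 0].numpy())                    # [1. 1. 1. 0. 0.]
```
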
diff --git a/examples/other/1xt2x/src_deepspeech2x/test_model.py b/examples/other/1xt2x/src_deepspeech2x/test_model.py
deleted file mode 100644
index 11b85442da50ef15a821a0d08b473191ca695a4d..0000000000000000000000000000000000000000
--- a/examples/other/1xt2x/src_deepspeech2x/test_model.py
+++ /dev/null
@@ -1,357 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Contains DeepSpeech2 and DeepSpeech2Online model."""
-import time
-from collections import defaultdict
-from contextlib import nullcontext
-
-import numpy as np
-import paddle
-from paddle import distributed as dist
-from paddle.io import DataLoader
-from src_deepspeech2x.models.ds2 import DeepSpeech2InferModel
-from src_deepspeech2x.models.ds2 import DeepSpeech2Model
-
-from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
-from paddlespeech.s2t.io.collator import SpeechCollator
-from paddlespeech.s2t.io.dataset import ManifestDataset
-from paddlespeech.s2t.io.sampler import SortagradBatchSampler
-from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler
-from paddlespeech.s2t.models.ds2_online import DeepSpeech2InferModelOnline
-from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline
-from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog
-from paddlespeech.s2t.training.trainer import Trainer
-from paddlespeech.s2t.utils import error_rate
-from paddlespeech.s2t.utils import layer_tools
-from paddlespeech.s2t.utils import mp_tools
-from paddlespeech.s2t.utils.log import Log
-
-logger = Log(__name__).getlog()
-
-
-class DeepSpeech2Trainer(Trainer):
- def __init__(self, config, args):
- super().__init__(config, args)
-
- def train_batch(self, batch_index, batch_data, msg):
- train_conf = self.config
- start = time.time()
-
- # forward
- utt, audio, audio_len, text, text_len = batch_data
- loss = self.model(audio, audio_len, text, text_len)
- losses_np = {
- 'train_loss': float(loss),
- }
-
- # loss backward
- if (batch_index + 1) % train_conf.accum_grad != 0:
- # Disable gradient synchronizations across DDP processes.
- # Within this context, gradients will be accumulated on module
- # variables, which will later be synchronized.
- context = self.model.no_sync
- else:
- # Used for single gpu training and DDP gradient synchronization
- # processes.
- context = nullcontext
-
- with context():
- loss.backward()
- layer_tools.print_grads(self.model, print_func=None)
-
- # optimizer step
- if (batch_index + 1) % train_conf.accum_grad == 0:
- self.optimizer.step()
- self.optimizer.clear_grad()
- self.iteration += 1
-
- iteration_time = time.time() - start
-
- msg += "train time: {:>.3f}s, ".format(iteration_time)
- msg += "batch size: {}, ".format(self.config.batch_size)
- msg += "accum: {}, ".format(train_conf.accum_grad)
- msg += ', '.join('{}: {:>.6f}'.format(k, v)
- for k, v in losses_np.items())
- logger.info(msg)
-
- if dist.get_rank() == 0 and self.visualizer:
- for k, v in losses_np.items():
- # `step -1` since we update `step` after optimizer.step().
- self.visualizer.add_scalar("train/{}".format(k), v,
- self.iteration - 1)
-
- @paddle.no_grad()
- def valid(self):
- logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}")
- self.model.eval()
- valid_losses = defaultdict(list)
- num_seen_utts = 1
- total_loss = 0.0
- for i, batch in enumerate(self.valid_loader):
- utt, audio, audio_len, text, text_len = batch
- loss = self.model(audio, audio_len, text, text_len)
- if paddle.isfinite(loss):
- num_utts = batch[1].shape[0]
- num_seen_utts += num_utts
- total_loss += float(loss) * num_utts
- valid_losses['val_loss'].append(float(loss))
-
- if (i + 1) % self.config.log_interval == 0:
- valid_dump = {k: np.mean(v) for k, v in valid_losses.items()}
- valid_dump['val_history_loss'] = total_loss / num_seen_utts
-
- # logging
- msg = f"Valid: Rank: {dist.get_rank()}, "
- msg += "epoch: {}, ".format(self.epoch)
- msg += "step: {}, ".format(self.iteration)
- msg += "batch : {}/{}, ".format(i + 1, len(self.valid_loader))
- msg += ', '.join('{}: {:>.6f}'.format(k, v)
- for k, v in valid_dump.items())
- logger.info(msg)
-
- logger.info('Rank {} Val info val_loss {}'.format(
- dist.get_rank(), total_loss / num_seen_utts))
- return total_loss, num_seen_utts
-
- def setup_model(self):
- config = self.config.clone()
- config.defrost()
- config.feat_size = self.train_loader.collate_fn.feature_size
- #config.dict_size = self.train_loader.collate_fn.vocab_size
- config.dict_size = len(self.train_loader.collate_fn.vocab_list)
- config.freeze()
-
- if self.args.model_type == 'offline':
- model = DeepSpeech2Model.from_config(config)
- elif self.args.model_type == 'online':
- model = DeepSpeech2ModelOnline.from_config(config)
- else:
- raise Exception("wrong model type")
- if self.parallel:
- model = paddle.DataParallel(model)
-
- logger.info(f"{model}")
- layer_tools.print_params(model, logger.info)
-
- grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip)
- lr_scheduler = paddle.optimizer.lr.ExponentialDecay(
- learning_rate=config.lr, gamma=config.lr_decay, verbose=True)
- optimizer = paddle.optimizer.Adam(
- learning_rate=lr_scheduler,
- parameters=model.parameters(),
- weight_decay=paddle.regularizer.L2Decay(config.weight_decay),
- grad_clip=grad_clip)
-
- self.model = model
- self.optimizer = optimizer
- self.lr_scheduler = lr_scheduler
- logger.info("Setup model/optimizer/lr_scheduler!")
-
- def setup_dataloader(self):
- config = self.config.clone()
- config.defrost()
- config.keep_transcription_text = False
-
- config.manifest = config.train_manifest
- train_dataset = ManifestDataset.from_config(config)
-
- config.manifest = config.dev_manifest
- dev_dataset = ManifestDataset.from_config(config)
-
- config.manifest = config.test_manifest
- test_dataset = ManifestDataset.from_config(config)
-
- if self.parallel:
- batch_sampler = SortagradDistributedBatchSampler(
- train_dataset,
- batch_size=config.batch_size,
- num_replicas=None,
- rank=None,
- shuffle=True,
- drop_last=True,
- sortagrad=config.sortagrad,
- shuffle_method=config.shuffle_method)
- else:
- batch_sampler = SortagradBatchSampler(
- train_dataset,
- shuffle=True,
- batch_size=config.batch_size,
- drop_last=True,
- sortagrad=config.sortagrad,
- shuffle_method=config.shuffle_method)
-
- collate_fn_train = SpeechCollator.from_config(config)
-
- config.augmentation_config = ""
- collate_fn_dev = SpeechCollator.from_config(config)
-
- config.keep_transcription_text = True
- config.augmentation_config = ""
- collate_fn_test = SpeechCollator.from_config(config)
-
- self.train_loader = DataLoader(
- train_dataset,
- batch_sampler=batch_sampler,
- collate_fn=collate_fn_train,
- num_workers=config.num_workers)
- self.valid_loader = DataLoader(
- dev_dataset,
- batch_size=config.batch_size,
- shuffle=False,
- drop_last=False,
- collate_fn=collate_fn_dev)
- self.test_loader = DataLoader(
- test_dataset,
- batch_size=config.decode.decode_batch_size,
- shuffle=False,
- drop_last=False,
- collate_fn=collate_fn_test)
- if "" in self.test_loader.collate_fn.vocab_list:
- self.test_loader.collate_fn.vocab_list.remove("")
- if "" in self.valid_loader.collate_fn.vocab_list:
- self.valid_loader.collate_fn.vocab_list.remove("")
- if "" in self.train_loader.collate_fn.vocab_list:
- self.train_loader.collate_fn.vocab_list.remove("")
- logger.info("Setup train/valid/test Dataloader!")
-
-
-class DeepSpeech2Tester(DeepSpeech2Trainer):
- def __init__(self, config, args):
-
- self._text_featurizer = TextFeaturizer(
- unit_type=config.unit_type, vocab=None)
- super().__init__(config, args)
-
- def ordid2token(self, texts, texts_len):
- """ ord() id to chr() chr """
- trans = []
- for text, n in zip(texts, texts_len):
- n = n.numpy().item()
- ids = text[:n]
- trans.append(''.join([chr(i) for i in ids]))
- return trans
-
- def compute_metrics(self,
- utts,
- audio,
- audio_len,
- texts,
- texts_len,
- fout=None):
- cfg = self.config.decode
- errors_sum, len_refs, num_ins = 0.0, 0, 0
- errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
- error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer
-
- target_transcripts = self.ordid2token(texts, texts_len)
-
- result_transcripts = self.compute_result_transcripts(audio, audio_len)
-
- for utt, target, result in zip(utts, target_transcripts,
- result_transcripts):
- errors, len_ref = errors_func(target, result)
- errors_sum += errors
- len_refs += len_ref
- num_ins += 1
- if fout:
- fout.write(utt + " " + result + "\n")
- logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" %
- (target, result))
- logger.info("Current error rate [%s] = %f" %
- (cfg.error_rate_type, error_rate_func(target, result)))
-
- return dict(
- errors_sum=errors_sum,
- len_refs=len_refs,
- num_ins=num_ins,
- error_rate=errors_sum / len_refs,
- error_rate_type=cfg.error_rate_type)
-
- def compute_result_transcripts(self, audio, audio_len):
- result_transcripts = self.model.decode(audio, audio_len)
-
- result_transcripts = [
- self._text_featurizer.detokenize(item)
- for item in result_transcripts
- ]
- return result_transcripts
-
- @mp_tools.rank_zero_only
- @paddle.no_grad()
- def test(self):
- logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
- self.model.eval()
- cfg = self.config
- error_rate_type = None
- errors_sum, len_refs, num_ins = 0.0, 0, 0
-
- # Initialized the decoder in model
- decode_cfg = self.config.decode
- vocab_list = self.test_loader.collate_fn.vocab_list
- decode_batch_size = self.test_loader.batch_size
- self.model.decoder.init_decoder(
- decode_batch_size, vocab_list, decode_cfg.decoding_method,
- decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta,
- decode_cfg.beam_size, decode_cfg.cutoff_prob,
- decode_cfg.cutoff_top_n, decode_cfg.num_proc_bsearch)
-
- with open(self.args.result_file, 'w') as fout:
- for i, batch in enumerate(self.test_loader):
- utts, audio, audio_len, texts, texts_len = batch
- metrics = self.compute_metrics(utts, audio, audio_len, texts,
- texts_len, fout)
- errors_sum += metrics['errors_sum']
- len_refs += metrics['len_refs']
- num_ins += metrics['num_ins']
- error_rate_type = metrics['error_rate_type']
- logger.info("Error rate [%s] (%d/?) = %f" %
- (error_rate_type, num_ins, errors_sum / len_refs))
-
- # logging
- msg = "Test: "
- msg += "epoch: {}, ".format(self.epoch)
- msg += "step: {}, ".format(self.iteration)
- msg += "Final error rate [%s] (%d/%d) = %f" % (
- error_rate_type, num_ins, num_ins, errors_sum / len_refs)
- logger.info(msg)
- self.model.decoder.del_decoder()
-
- def run_test(self):
- self.resume_or_scratch()
- try:
- self.test()
- except KeyboardInterrupt:
- exit(-1)
-
- def export(self):
- if self.args.model_type == 'offline':
- infer_model = DeepSpeech2InferModel.from_pretrained(
- self.test_loader, self.config, self.args.checkpoint_path)
- elif self.args.model_type == 'online':
- infer_model = DeepSpeech2InferModelOnline.from_pretrained(
- self.test_loader, self.config, self.args.checkpoint_path)
- else:
- raise Exception("wrong model type")
-
- infer_model.eval()
- feat_dim = self.test_loader.collate_fn.feature_size
- static_model = infer_model.export()
- logger.info(f"Export code: {static_model.forward.code}")
- paddle.jit.save(static_model, self.args.export_path)
-
- def run_export(self):
- try:
- self.export()
- except KeyboardInterrupt:
- exit(-1)
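
The deleted tester accumulates corpus-level error rates: per-utterance edit distances and reference lengths are summed across all batches, and the ratio is taken once at the end, so long utterances weigh more than short ones. A tiny worked example with a stand-in for `error_rate.char_errors` (made-up data):

```python
pairs = [("hello", "helo"), ("hi", "hi")]    # (reference, hypothesis)

def char_errors(ref, hyp):
    # plain character-level Levenshtein distance, standing in for error_rate.char_errors
    prev = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
        cur = [i]
        for j, h in enumerate(hyp, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (r != h)))
        prev = cur
    return prev[-1], len(ref)

errors_sum, len_refs = 0, 0
for ref, hyp in pairs:
    errors, len_ref = char_errors(ref, hyp)
    errors_sum += errors
    len_refs += len_ref

print(errors_sum / len_refs)   # 1 error over 7 reference chars ~= 0.143
```
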
diff --git a/examples/other/mfa/local/reorganize_baker.py b/examples/other/mfa/local/reorganize_baker.py
index 8adad834ffc466bac20781af8ad37d4bcb2e4590..153e01d13f2ceea67f62d7b8e78a216c4dc3302e 100644
--- a/examples/other/mfa/local/reorganize_baker.py
+++ b/examples/other/mfa/local/reorganize_baker.py
@@ -42,9 +42,6 @@ def get_transcripts(path: Union[str, Path]):
for i in range(0, len(lines), 2):
sentence_id = lines[i].split()[0]
transcription = lines[i + 1].strip()
- # tones are dropped here
- # since the lexicon does not consider tones, too
- transcription = " ".join([item[:-1] for item in transcription.split()])
transcripts[sentence_id] = transcription
return transcripts
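
For context on the `reorganize_baker.py` change above: Baker transcripts carry tone-numbered pinyin, and the removed line stripped the trailing digit because the old lexicon had no tones. With `--with-tone` now passed to `generate_lexicon.py` (see the `run.sh` change below), the transcription is kept verbatim. A sketch of the old versus new behaviour (example line is made up):

```python
transcription = "ka2 er3 pu3 pei2 wai4 sun1"                      # tone-numbered pinyin

dropped = " ".join(item[:-1] for item in transcription.split())   # old behaviour
print(dropped)         # "ka er pu pei wai sun"
print(transcription)   # new behaviour: tones preserved for the tonal lexicon
```
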
diff --git a/examples/other/mfa/run.sh b/examples/other/mfa/run.sh
old mode 100644
new mode 100755
index 1fef58b4ebcf8a662367a77895415bebf530992a..29dacc9b1397e0b014ed1feb059e4f63b0b088b2
--- a/examples/other/mfa/run.sh
+++ b/examples/other/mfa/run.sh
@@ -4,7 +4,7 @@ mkdir -p $EXP_DIR
LEXICON_NAME='simple'
if [ ! -f "$EXP_DIR/$LEXICON_NAME.lexicon" ]; then
echo "generating lexicon..."
- python local/generate_lexicon.py "$EXP_DIR/$LEXICON_NAME" --with-r
+ python local/generate_lexicon.py "$EXP_DIR/$LEXICON_NAME" --with-r --with-tone
echo "lexicon done"
fi
@@ -16,6 +16,7 @@ if [ ! -d $EXP_DIR/baker_corpus ]; then
echo "transcription for each audio file is saved with the same namd in $EXP_DIR/baker_corpus "
fi
+
echo "detecting oov..."
python local/detect_oov.py $EXP_DIR/baker_corpus $EXP_DIR/"$LEXICON_NAME.lexicon"
echo "detecting oov done. you may consider regenerate lexicon if there is unexpected OOVs."
@@ -44,6 +45,3 @@ if [ ! -d "$EXP_DIR/baker_alignment" ]; then
echo "model: $EXP_DIR/baker_model"
fi
-
-
-
diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md
index 8864910878a8292d2afacdbd3442b9de101931ed..0b0ce09349dbf11e0823392e7ace9aeb9c1033cc 100644
--- a/examples/vctk/tts3/README.md
+++ b/examples/vctk/tts3/README.md
@@ -112,12 +112,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
```
```text
usage: synthesize.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
[--voice-cloning VOICE_CLONING]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--ngpu NGPU]
[--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -126,11 +126,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -142,10 +141,10 @@ optional arguments:
speaker id map file.
--voice-cloning VOICE_CLONING
whether training voice cloning model.
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -161,12 +160,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
```
```text
usage: synthesize_e2e.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT]
[--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--lang LANG]
[--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -176,11 +175,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -191,10 +189,10 @@ optional arguments:
--speaker_dict SPEAKER_DICT
speaker id map file.
--spk_id SPK_ID spk id for multi speaker acoustic model
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -207,9 +205,9 @@ optional arguments:
output dir.
```
1. `--am` is acoustic model type with the format {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model.
+2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model.
3. `--voc` is vocoder type with the format {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
5. `--lang` is the model language, which can be `zh` or `en`.
6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
7. `--text` is the text file, which contains sentences to synthesize.
diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md
index 45ba51013e1399a2f6916d0f1b7c0723e22618d2..a0e06a4206d8c214119b187164396fe9a0b1711b 100644
--- a/examples/vctk/voc1/README.md
+++ b/examples/vctk/voc1/README.md
@@ -70,7 +70,7 @@ Train a ParallelWaveGAN model.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG ParallelWaveGAN config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
diff --git a/examples/vctk/voc5/README.md b/examples/vctk/voc5/README.md
index 514af4679a13242cd9b7cb4663e520805a1275e1..f2cbf27d21706d0702e46a20ff57aabf737de6a2 100644
--- a/examples/vctk/voc5/README.md
+++ b/examples/vctk/voc5/README.md
@@ -62,15 +62,13 @@ Here's the complete help message.
```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
- [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
- [--run-benchmark RUN_BENCHMARK]
- [--profiler_options PROFILER_OPTIONS]
+ [--ngpu NGPU]
-Train a ParallelWaveGAN model.
+Train a HiFiGAN model.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG HiFiGAN config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
@@ -78,19 +76,6 @@ optional arguments:
--output-dir OUTPUT_DIR
output dir.
--ngpu NGPU if ngpu == 0, use cpu.
-
-benchmark:
- arguments related to benchmark.
-
- --batch-size BATCH_SIZE
- batch size.
- --max-iter MAX_ITER train max steps.
- --run-benchmark RUN_BENCHMARK
- runing benchmark or not, if True, use the --batch-size
- and --max-iter.
- --profiler_options PROFILER_OPTIONS
- The option of profiler, which should be in format
- "key1=value1;key2=value2;key3=value3".
```
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
diff --git a/examples/wenetspeech/asr1/local/extract_meta.py b/examples/wenetspeech/asr1/local/extract_meta.py
index 0e1b2727838052740e5e89593dcdab04ffe387c9..2cad977bee913d6c0db7719ae7d86803e48ac0ff 100644
--- a/examples/wenetspeech/asr1/local/extract_meta.py
+++ b/examples/wenetspeech/asr1/local/extract_meta.py
@@ -1,18 +1,7 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
# Mobvoi Inc(Author: Di Wu, Binbin Zhang)
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@@ -24,6 +13,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+
import argparse
import json
import os
diff --git a/paddlespeech/cli/__init__.py b/paddlespeech/cli/__init__.py
index ddf0359bc5fcb7ff80b437a65112869d7faa12eb..ca6993f2b003054062cb99f37675ad7009f70d32 100644
--- a/paddlespeech/cli/__init__.py
+++ b/paddlespeech/cli/__init__.py
@@ -13,14 +13,7 @@
# limitations under the License.
import _locale
-from .asr import ASRExecutor
from .base_commands import BaseCommand
from .base_commands import HelpCommand
-from .cls import CLSExecutor
-from .st import STExecutor
-from .stats import StatsExecutor
-from .text import TextExecutor
-from .tts import TTSExecutor
-from .vector import VectorExecutor
_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])
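
With the package-level re-exports removed above, executors are imported from their own subpackages (which is also how the new lazy command registration in `base_commands.py` refers to them), so importing the CLI package no longer pulls in every task's dependencies. A minimal sketch, assuming the per-task subpackages keep exporting their executors:

```python
# old, removed above:
#   from paddlespeech.cli import ASRExecutor
# new, per-task import:
from paddlespeech.cli.asr import ASRExecutor

asr_executor = ASRExecutor()   # other tasks' modules are never imported
```
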
diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py
index 2d74afa6d72e166aacbe98003ba4db3e80c4b130..92f9b0e41fcda1840a4d87b4a0fb13afc877112e 100644
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -29,30 +29,21 @@ from yacs.config import CfgNode
from ..download import get_path_from_url
from ..executor import BaseExecutor
from ..log import logger
-from ..utils import cli_register
from ..utils import CLI_TIMER
from ..utils import MODEL_HOME
from ..utils import stats_wrapper
from ..utils import timer_register
-from .pretrained_models import model_alias
-from .pretrained_models import pretrained_models
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.transform.transformation import Transformation
-from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.s2t.utils.utility import UpdateConfig
__all__ = ['ASRExecutor']
@timer_register
-@cli_register(
- name='paddlespeech.asr', description='Speech to text infer command.')
class ASRExecutor(BaseExecutor):
def __init__(self):
- super().__init__()
- self.model_alias = model_alias
- self.pretrained_models = pretrained_models
-
+ super().__init__(task='asr', inference_type='offline')
self.parser = argparse.ArgumentParser(
prog='paddlespeech.asr', add_help=True)
self.parser.add_argument(
@@ -62,7 +53,8 @@ class ASRExecutor(BaseExecutor):
type=str,
default='conformer_wenetspeech',
choices=[
- tag[:tag.index('-')] for tag in self.pretrained_models.keys()
+ tag[:tag.index('-')]
+ for tag in self.task_resource.pretrained_models.keys()
],
help='Choose model type of asr task.')
self.parser.add_argument(
@@ -144,14 +136,14 @@ class ASRExecutor(BaseExecutor):
if cfg_path is None or ckpt_path is None:
sample_rate_str = '16k' if sample_rate == 16000 else '8k'
tag = model_type + '-' + lang + '-' + sample_rate_str
- res_path = self._get_pretrained_path(tag) # wenetspeech_zh
- self.res_path = res_path
+ self.task_resource.set_task_model(tag, version=None)
+ self.res_path = self.task_resource.res_dir
self.cfg_path = os.path.join(
- res_path, self.pretrained_models[tag]['cfg_path'])
+ self.res_path, self.task_resource.res_dict['cfg_path'])
self.ckpt_path = os.path.join(
- res_path,
- self.pretrained_models[tag]['ckpt_path'] + ".pdparams")
- logger.info(res_path)
+ self.res_path,
+ self.task_resource.res_dict['ckpt_path'] + ".pdparams")
+ logger.info(self.res_path)
else:
self.cfg_path = os.path.abspath(cfg_path)
@@ -175,8 +167,8 @@ class ASRExecutor(BaseExecutor):
self.collate_fn_test = SpeechCollator.from_config(self.config)
self.text_feature = TextFeaturizer(
unit_type=self.config.unit_type, vocab=self.vocab)
- lm_url = self.pretrained_models[tag]['lm_url']
- lm_md5 = self.pretrained_models[tag]['lm_md5']
+ lm_url = self.task_resource.res_dict['lm_url']
+ lm_md5 = self.task_resource.res_dict['lm_md5']
self.download_lm(
lm_url,
os.path.dirname(self.config.decode.lang_model_path), lm_md5)
@@ -194,7 +186,7 @@ class ASRExecutor(BaseExecutor):
raise Exception("wrong type")
model_name = model_type[:model_type.rindex(
'_')] # model_type: {model_name}_{dataset}
- model_class = dynamic_import(model_name, self.model_alias)
+ model_class = self.task_resource.get_model_class(model_name)
model_conf = self.config
model = model_class.from_config(model_conf)
self.model = model
@@ -441,7 +433,7 @@ class ASRExecutor(BaseExecutor):
if not parser_args.verbose:
self.disable_task_loggers()
- task_source = self.get_task_source(parser_args.input)
+ task_source = self.get_input_source(parser_args.input)
task_results = OrderedDict()
has_exceptions = False
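
The `infer.py` changes above replace the module-local `pretrained_models` / `model_alias` dicts with a shared resource object. A rough sketch of the new lookup flow, using only the attributes visible in this patch (the constructor arguments beyond `task` and any download behaviour of `set_task_model` are assumptions):

```python
import os
from paddlespeech.resource import CommonTaskResource

task_resource = CommonTaskResource(task='asr')
tag = 'conformer_wenetspeech' + '-' + 'zh' + '-' + '16k'    # {model}[-{lang}][-{sr}]

task_resource.set_task_model(tag, version=None)             # resolves (and likely fetches) the resources
cfg_path = os.path.join(task_resource.res_dir, task_resource.res_dict['cfg_path'])
ckpt_path = os.path.join(task_resource.res_dir,
                         task_resource.res_dict['ckpt_path'] + '.pdparams')
model_class = task_resource.get_model_class('conformer')    # replaces dynamic_import + model_alias
```
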
diff --git a/paddlespeech/cli/asr/pretrained_models.py b/paddlespeech/cli/asr/pretrained_models.py
deleted file mode 100644
index 0f521884020b039a074ad302100a58a59e4d77b1..0000000000000000000000000000000000000000
--- a/paddlespeech/cli/asr/pretrained_models.py
+++ /dev/null
@@ -1,151 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-pretrained_models = {
- # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
- # e.g. "conformer_wenetspeech-zh-16k" and "panns_cnn6-32k".
- # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
- # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
- "conformer_wenetspeech-zh-16k": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1_conformer_wenetspeech_ckpt_0.1.1.model.tar.gz',
- 'md5':
- '76cb19ed857e6623856b7cd7ebbfeda4',
- 'cfg_path':
- 'model.yaml',
- 'ckpt_path':
- 'exp/conformer/checkpoints/wenetspeech',
- },
- "conformer_online_wenetspeech-zh-16k": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz',
- 'md5':
- 'b8c02632b04da34aca88459835be54a6',
- 'cfg_path':
- 'model.yaml',
- 'ckpt_path':
- 'exp/chunk_conformer/checkpoints/avg_10',
- },
- "conformer_online_multicn-zh-16k": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.0.model.tar.gz',
- 'md5':
- '7989b3248c898070904cf042fd656003',
- 'cfg_path':
- 'model.yaml',
- 'ckpt_path':
- 'exp/chunk_conformer/checkpoints/multi_cn',
- },
- "conformer_aishell-zh-16k": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz',
- 'md5':
- '3f073eccfa7bb14e0c6867d65fc0dc3a',
- 'cfg_path':
- 'model.yaml',
- 'ckpt_path':
- 'exp/conformer/checkpoints/avg_30',
- },
- "conformer_online_aishell-zh-16k": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz',
- 'md5':
- 'b374cfb93537761270b6224fb0bfc26a',
- 'cfg_path':
- 'model.yaml',
- 'ckpt_path':
- 'exp/chunk_conformer/checkpoints/avg_30',
- },
- "transformer_librispeech-en-16k": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz',
- 'md5':
- '2c667da24922aad391eacafe37bc1660',
- 'cfg_path':
- 'model.yaml',
- 'ckpt_path':
- 'exp/transformer/checkpoints/avg_10',
- },
- "deepspeech2online_wenetspeech-zh-16k": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz',
- 'md5':
- 'e393d4d274af0f6967db24fc146e8074',
- 'cfg_path':
- 'model.yaml',
- 'ckpt_path':
- 'exp/deepspeech2_online/checkpoints/avg_10',
- 'lm_url':
- 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
- 'lm_md5':
- '29e02312deb2e59b3c8686c7966d4fe3'
- },
- "deepspeech2offline_aishell-zh-16k": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
- 'md5':
- '932c3593d62fe5c741b59b31318aa314',
- 'cfg_path':
- 'model.yaml',
- 'ckpt_path':
- 'exp/deepspeech2/checkpoints/avg_1',
- 'lm_url':
- 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
- 'lm_md5':
- '29e02312deb2e59b3c8686c7966d4fe3'
- },
- "deepspeech2online_aishell-zh-16k": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz',
- 'md5':
- '98b87b171b7240b7cae6e07d8d0bc9be',
- 'cfg_path':
- 'model.yaml',
- 'ckpt_path':
- 'exp/deepspeech2_online/checkpoints/avg_1',
- 'lm_url':
- 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
- 'lm_md5':
- '29e02312deb2e59b3c8686c7966d4fe3'
- },
- "deepspeech2offline_librispeech-en-16k": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz',
- 'md5':
- 'f5666c81ad015c8de03aac2bc92e5762',
- 'cfg_path':
- 'model.yaml',
- 'ckpt_path':
- 'exp/deepspeech2/checkpoints/avg_1',
- 'lm_url':
- 'https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm',
- 'lm_md5':
- '099a601759d467cd0a8523ff939819c5'
- },
-}
-
-model_alias = {
- "deepspeech2offline":
- "paddlespeech.s2t.models.ds2:DeepSpeech2Model",
- "deepspeech2online":
- "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline",
- "conformer":
- "paddlespeech.s2t.models.u2:U2Model",
- "conformer_online":
- "paddlespeech.s2t.models.u2:U2Model",
- "transformer":
- "paddlespeech.s2t.models.u2:U2Model",
- "wenetspeech":
- "paddlespeech.s2t.models.u2:U2Model",
-}
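
The tag convention documented in the deleted table is what the CLI still relies on: everything before the first `-` becomes the `--model` choice, and the remaining fields carry language and sample rate. A small illustration using tags from the table above:

```python
tags = [
    "conformer_wenetspeech-zh-16k",
    "transformer_librispeech-en-16k",
    "deepspeech2offline_aishell-zh-16k",
]

choices = [tag[:tag.index('-')] for tag in tags]   # same expression as in infer.py
print(choices)   # ['conformer_wenetspeech', 'transformer_librispeech', 'deepspeech2offline_aishell']

model, lang, sr = tags[0].split('-')
print(model, lang, sr)   # conformer_wenetspeech zh 16k
```
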
diff --git a/paddlespeech/cli/base_commands.py b/paddlespeech/cli/base_commands.py
index 0a26b12030a0b25fe169be5ad3bc61e82c500fa7..39bf24524d27318de2af8d519076f92da4e3db01 100644
--- a/paddlespeech/cli/base_commands.py
+++ b/paddlespeech/cli/base_commands.py
@@ -11,16 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import argparse
from typing import List
+from prettytable import PrettyTable
+
+from ..resource import CommonTaskResource
from .entry import commands
from .utils import cli_register
+from .utils import explicit_command_register
from .utils import get_command
-__all__ = [
- 'BaseCommand',
- 'HelpCommand',
-]
+__all__ = ['BaseCommand', 'HelpCommand', 'StatsCommand']
@cli_register(name='paddlespeech')
@@ -73,3 +75,74 @@ class VersionCommand:
print(msg)
return True
+
+
+model_name_format = {
+ 'asr': 'Model-Language-Sample Rate',
+ 'cls': 'Model-Sample Rate',
+ 'st': 'Model-Source language-Target language',
+ 'text': 'Model-Task-Language',
+ 'tts': 'Model-Language',
+ 'vector': 'Model-Sample Rate'
+}
+
+
+@cli_register(
+ name='paddlespeech.stats',
+ description='Get speech tasks support models list.')
+class StatsCommand:
+ def __init__(self):
+ self.parser = argparse.ArgumentParser(
+ prog='paddlespeech.stats', add_help=True)
+ self.task_choices = ['asr', 'cls', 'st', 'text', 'tts', 'vector']
+ self.parser.add_argument(
+ '--task',
+ type=str,
+ default='asr',
+ choices=self.task_choices,
+ help='Choose speech task.',
+ required=True)
+
+ def show_support_models(self, pretrained_models: dict):
+ fields = model_name_format[self.task].split("-")
+ table = PrettyTable(fields)
+ for key in pretrained_models:
+ table.add_row(key.split("-"))
+ print(table)
+
+ def execute(self, argv: List[str]) -> bool:
+ parser_args = self.parser.parse_args(argv)
+ self.task = parser_args.task
+ if self.task not in self.task_choices:
+ print("Please input correct speech task, choices = " + str(
+ self.task_choices))
+ return
+
+ pretrained_models = CommonTaskResource(task=self.task).pretrained_models
+
+ try:
+ print(
+ "Here is the list of {} pretrained models released by PaddleSpeech that can be used by command line and python API"
+ .format(self.task.upper()))
+ self.show_support_models(pretrained_models)
+ except BaseException:
+ print("Failed to get the list of {} pretrained models.".format(
+ self.task.upper()))
+
+
+# Dynamic import when running specific command
+_commands = {
+ 'asr': ['Speech to text infer command.', 'ASRExecutor'],
+ 'cls': ['Audio classification infer command.', 'CLSExecutor'],
+ 'st': ['Speech translation infer command.', 'STExecutor'],
+ 'text': ['Text command.', 'TextExecutor'],
+ 'tts': ['Text to Speech infer command.', 'TTSExecutor'],
+ 'vector': ['Speech to vector embedding infer command.', 'VectorExecutor'],
+}
+
+for com, info in _commands.items():
+ explicit_command_register(
+ name='paddlespeech.{}'.format(com),
+ description=info[0],
+ cls='paddlespeech.cli.{}.{}'.format(com, info[1]))
+
\ No newline at end of file
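With this change the base commands keep decorator-based registration, while the task commands are registered by dotted class path and imported only when actually invoked. A minimal sketch of that flow, reusing the names defined above:

    from paddlespeech.cli.entry import commands
    from paddlespeech.cli.utils import explicit_command_register

    explicit_command_register(
        name='paddlespeech.asr',
        description='Speech to text infer command.',
        cls='paddlespeech.cli.asr.ASRExecutor')

    # The command tree now stores a string, not a class; entry.py swaps it for
    # the real ASRExecutor class only when `paddlespeech asr ...` is executed.
    entry = commands['paddlespeech']['asr']['_entry']  # 'paddlespeech.cli.asr.ASRExecutor'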
diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py
index 40072d9974e5798dc5d74b921efb905230e06246..1a9949748f339e838ac1bed7400308aedd8eb1c9 100644
--- a/paddlespeech/cli/cls/infer.py
+++ b/paddlespeech/cli/cls/infer.py
@@ -21,29 +21,19 @@ from typing import Union
import numpy as np
import paddle
import yaml
-from paddleaudio import load
-from paddleaudio.features import LogMelSpectrogram
-from paddlespeech.utils.dynamic_import import dynamic_import
from ..executor import BaseExecutor
from ..log import logger
-from ..utils import cli_register
from ..utils import stats_wrapper
-from .pretrained_models import model_alias
-from .pretrained_models import pretrained_models
-
+from paddleaudio import load
+from paddleaudio.features import LogMelSpectrogram
__all__ = ['CLSExecutor']
-@cli_register(
- name='paddlespeech.cls', description='Audio classification infer command.')
class CLSExecutor(BaseExecutor):
def __init__(self):
- super().__init__()
- self.model_alias = model_alias
- self.pretrained_models = pretrained_models
-
+ super().__init__(task='cls')
self.parser = argparse.ArgumentParser(
prog='paddlespeech.cls', add_help=True)
self.parser.add_argument(
@@ -53,7 +43,8 @@ class CLSExecutor(BaseExecutor):
type=str,
default='panns_cnn14',
choices=[
- tag[:tag.index('-')] for tag in self.pretrained_models.keys()
+ tag[:tag.index('-')]
+ for tag in self.task_resource.pretrained_models.keys()
],
help='Choose model type of cls task.')
self.parser.add_argument(
@@ -106,13 +97,16 @@ class CLSExecutor(BaseExecutor):
if label_file is None or ckpt_path is None:
tag = model_type + '-' + '32k' # panns_cnn14-32k
- self.res_path = self._get_pretrained_path(tag)
+ self.task_resource.set_task_model(tag, version=None)
self.cfg_path = os.path.join(
- self.res_path, self.pretrained_models[tag]['cfg_path'])
+ self.task_resource.res_dir,
+ self.task_resource.res_dict['cfg_path'])
self.label_file = os.path.join(
- self.res_path, self.pretrained_models[tag]['label_file'])
+ self.task_resource.res_dir,
+ self.task_resource.res_dict['label_file'])
self.ckpt_path = os.path.join(
- self.res_path, self.pretrained_models[tag]['ckpt_path'])
+ self.task_resource.res_dir,
+ self.task_resource.res_dict['ckpt_path'])
else:
self.cfg_path = os.path.abspath(cfg_path)
self.label_file = os.path.abspath(label_file)
@@ -129,7 +123,7 @@ class CLSExecutor(BaseExecutor):
self._label_list.append(line.strip())
# model
- model_class = dynamic_import(model_type, self.model_alias)
+ model_class = self.task_resource.get_model_class(model_type)
model_dict = paddle.load(self.ckpt_path)
self.model = model_class(extract_embedding=False)
self.model.set_state_dict(model_dict)
@@ -206,7 +200,7 @@ class CLSExecutor(BaseExecutor):
if not parser_args.verbose:
self.disable_task_loggers()
- task_source = self.get_task_source(parser_args.input)
+ task_source = self.get_input_source(parser_args.input)
task_results = OrderedDict()
has_exceptions = False
@@ -246,4 +240,4 @@ class CLSExecutor(BaseExecutor):
self.infer()
res = self.postprocess(topk) # Retrieve result of cls.
- return res
\ No newline at end of file
+ return res
diff --git a/paddlespeech/cli/cls/pretrained_models.py b/paddlespeech/cli/cls/pretrained_models.py
deleted file mode 100644
index 1d66850aa7fa55733c8a0680889906894e235126..0000000000000000000000000000000000000000
--- a/paddlespeech/cli/cls/pretrained_models.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-pretrained_models = {
- # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
- # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k".
- # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
- # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
- "panns_cnn6-32k": {
- 'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn6.tar.gz',
- 'md5': '4cf09194a95df024fd12f84712cf0f9c',
- 'cfg_path': 'panns.yaml',
- 'ckpt_path': 'cnn6.pdparams',
- 'label_file': 'audioset_labels.txt',
- },
- "panns_cnn10-32k": {
- 'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn10.tar.gz',
- 'md5': 'cb8427b22176cc2116367d14847f5413',
- 'cfg_path': 'panns.yaml',
- 'ckpt_path': 'cnn10.pdparams',
- 'label_file': 'audioset_labels.txt',
- },
- "panns_cnn14-32k": {
- 'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn14.tar.gz',
- 'md5': 'e3b9b5614a1595001161d0ab95edee97',
- 'cfg_path': 'panns.yaml',
- 'ckpt_path': 'cnn14.pdparams',
- 'label_file': 'audioset_labels.txt',
- },
-}
-
-model_alias = {
- "panns_cnn6": "paddlespeech.cls.models.panns:CNN6",
- "panns_cnn10": "paddlespeech.cls.models.panns:CNN10",
- "panns_cnn14": "paddlespeech.cls.models.panns:CNN14",
-}
diff --git a/paddlespeech/cli/entry.py b/paddlespeech/cli/entry.py
index 32123ece750457dac8ca90aff1a8731fea569188..e0c306d62a7d55b8a48a147fa7d13dcda866ab79 100644
--- a/paddlespeech/cli/entry.py
+++ b/paddlespeech/cli/entry.py
@@ -34,6 +34,11 @@ def _execute():
# The method 'execute' of a command instance returns 'True' for a success
# while 'False' for a failure. Here converts this result into a exit status
# in bash: 0 for a success and 1 for a failure.
+ if not callable(com['_entry']):
+ i = com['_entry'].rindex('.')
+ module, cls = com['_entry'][:i], com['_entry'][i + 1:]
+ exec("from {} import {}".format(module, cls))
+ com['_entry'] = locals()[cls]
status = 0 if com['_entry']().execute(sys.argv[idx:]) else 1
return status
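The branch added above resolves a dotted class path the first time its command runs and caches the imported class back into the tree. An importlib-based sketch of the same resolution step (the patch itself uses exec):

    import importlib

    def resolve_entry(entry):
        """Return the executor class, importing it on first use (sketch)."""
        if callable(entry):
            return entry
        module_name, _, class_name = entry.rpartition('.')
        return getattr(importlib.import_module(module_name), class_name)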
diff --git a/paddlespeech/cli/executor.py b/paddlespeech/cli/executor.py
index 4a631c7f5b8d73ad1095a3d00b8ddbbc9615a8e5..d390f947d17cccc99a12eee75f634242e4bac9bb 100644
--- a/paddlespeech/cli/executor.py
+++ b/paddlespeech/cli/executor.py
@@ -24,9 +24,8 @@ from typing import Union
import paddle
+from ..resource import CommonTaskResource
from .log import logger
-from .utils import download_and_decompress
-from .utils import MODEL_HOME
class BaseExecutor(ABC):
@@ -34,11 +33,10 @@ class BaseExecutor(ABC):
An abstract executor of paddlespeech tasks.
"""
- def __init__(self):
+ def __init__(self, task: str, **kwargs):
self._inputs = OrderedDict()
self._outputs = OrderedDict()
- self.pretrained_models = OrderedDict()
- self.model_alias = OrderedDict()
+ self.task_resource = CommonTaskResource(task=task, **kwargs)
@abstractmethod
def _init_from_path(self, *args, **kwargs):
@@ -98,8 +96,8 @@ class BaseExecutor(ABC):
"""
pass
- def get_task_source(self, input_: Union[str, os.PathLike, None]
- ) -> Dict[str, Union[str, os.PathLike]]:
+ def get_input_source(self, input_: Union[str, os.PathLike, None]
+ ) -> Dict[str, Union[str, os.PathLike]]:
"""
Get task input source from command line input.
@@ -115,15 +113,17 @@ class BaseExecutor(ABC):
ret = OrderedDict()
if input_ is None: # Take input from stdin
- for i, line in enumerate(sys.stdin):
- line = line.strip()
- if len(line.split(' ')) == 1:
- ret[str(i + 1)] = line
- elif len(line.split(' ')) == 2:
- id_, info = line.split(' ')
- ret[id_] = info
- else: # No valid input info from one line.
- continue
+ if not sys.stdin.isatty(
+ ): # Avoid getting stuck when stdin is empty.
+ for i, line in enumerate(sys.stdin):
+ line = line.strip()
+ if len(line.split(' ')) == 1:
+ ret[str(i + 1)] = line
+ elif len(line.split(' ')) == 2:
+ id_, info = line.split(' ')
+ ret[id_] = info
+ else: # No valid input info from one line.
+ continue
else:
ret[1] = input_
return ret
@@ -219,23 +219,6 @@ class BaseExecutor(ABC):
for l in loggers:
l.disabled = True
- def _get_pretrained_path(self, tag: str) -> os.PathLike:
- """
- Download and returns pretrained resources path of current task.
- """
- support_models = list(self.pretrained_models.keys())
- assert tag in self.pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
- tag, '\n\t\t'.join(support_models))
-
- res_path = os.path.join(MODEL_HOME, tag)
- decompressed_path = download_and_decompress(self.pretrained_models[tag],
- res_path)
- decompressed_path = os.path.abspath(decompressed_path)
- logger.info(
- 'Use pretrained model stored in: {}'.format(decompressed_path))
-
- return decompressed_path
-
def show_rtf(self, info: Dict[str, List[float]]):
"""
Calculate rft of current task and show results.
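BaseExecutor no longer carries its own pretrained_models/model_alias tables or the _get_pretrained_path helper; every executor now delegates to CommonTaskResource. A sketch of the shared lookup pattern, mirroring the CLS changes earlier in this diff and assuming set_task_model also performs the download that _get_pretrained_path used to do:

    import os

    from paddlespeech.resource import CommonTaskResource

    resource = CommonTaskResource(task='cls')
    resource.set_task_model('panns_cnn14-32k', version=None)  # fetch the default version
    cfg_path = os.path.join(resource.res_dir, resource.res_dict['cfg_path'])
    ckpt_path = os.path.join(resource.res_dir, resource.res_dict['ckpt_path'])
    model_class = resource.get_model_class('panns_cnn14')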
diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py
index 4f210fbe685df50236379e196639395b2ce4adf2..e1ce181af351c4bf651a913d2de7005c5dc37e51 100644
--- a/paddlespeech/cli/st/infer.py
+++ b/paddlespeech/cli/st/infer.py
@@ -28,27 +28,25 @@ from yacs.config import CfgNode
from ..executor import BaseExecutor
from ..log import logger
-from ..utils import cli_register
from ..utils import download_and_decompress
from ..utils import MODEL_HOME
from ..utils import stats_wrapper
-from .pretrained_models import kaldi_bins
-from .pretrained_models import model_alias
-from .pretrained_models import pretrained_models
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.utils.utility import UpdateConfig
-from paddlespeech.utils.dynamic_import import dynamic_import
__all__ = ["STExecutor"]
+kaldi_bins = {
+ "url":
+ "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/kaldi_bins.tar.gz",
+ "md5":
+ "c0682303b3f3393dbf6ed4c4e35a53eb",
+}
+
-@cli_register(
- name="paddlespeech.st", description="Speech translation infer command.")
class STExecutor(BaseExecutor):
def __init__(self):
- super().__init__()
- self.model_alias = model_alias
- self.pretrained_models = pretrained_models
+ super().__init__(task='st')
self.kaldi_bins = kaldi_bins
self.parser = argparse.ArgumentParser(
@@ -60,7 +58,8 @@ class STExecutor(BaseExecutor):
type=str,
default="fat_st_ted",
choices=[
- tag[:tag.index('-')] for tag in self.pretrained_models.keys()
+ tag[:tag.index('-')]
+ for tag in self.task_resource.pretrained_models.keys()
],
help="Choose model type of st task.")
self.parser.add_argument(
@@ -134,14 +133,16 @@ class STExecutor(BaseExecutor):
if cfg_path is None or ckpt_path is None:
tag = model_type + "-" + src_lang + "-" + tgt_lang
- res_path = self._get_pretrained_path(tag)
- self.cfg_path = os.path.join(res_path,
- pretrained_models[tag]["cfg_path"])
- self.ckpt_path = os.path.join(res_path,
- pretrained_models[tag]["ckpt_path"])
- logger.info(res_path)
+ self.task_resource.set_task_model(tag, version=None)
+ self.cfg_path = os.path.join(
+ self.task_resource.res_dir,
+ self.task_resource.res_dict['cfg_path'])
+ self.ckpt_path = os.path.join(
+ self.task_resource.res_dir,
+ self.task_resource.res_dict['ckpt_path'])
logger.info(self.cfg_path)
logger.info(self.ckpt_path)
+ res_path = self.task_resource.res_dir
else:
self.cfg_path = os.path.abspath(cfg_path)
self.ckpt_path = os.path.abspath(ckpt_path)
@@ -166,7 +167,7 @@ class STExecutor(BaseExecutor):
model_conf = self.config
model_name = model_type[:model_type.rindex(
'_')] # model_type: {model_name}_{dataset}
- model_class = dynamic_import(model_name, self.model_alias)
+ model_class = self.task_resource.get_model_class(model_name)
self.model = model_class.from_config(model_conf)
self.model.eval()
@@ -304,7 +305,7 @@ class STExecutor(BaseExecutor):
if not parser_args.verbose:
self.disable_task_loggers()
- task_source = self.get_task_source(parser_args.input)
+ task_source = self.get_input_source(parser_args.input)
task_results = OrderedDict()
has_exceptions = False
diff --git a/paddlespeech/cli/st/pretrained_models.py b/paddlespeech/cli/st/pretrained_models.py
deleted file mode 100644
index cc7410d253f34109424e49ea0d2622e12ce93ea5..0000000000000000000000000000000000000000
--- a/paddlespeech/cli/st/pretrained_models.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-pretrained_models = {
- "fat_st_ted-en-zh": {
- "url":
- "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/st1_transformer_mtl_noam_ted-en-zh_ckpt_0.1.1.model.tar.gz",
- "md5":
- "d62063f35a16d91210a71081bd2dd557",
- "cfg_path":
- "model.yaml",
- "ckpt_path":
- "exp/transformer_mtl_noam/checkpoints/fat_st_ted-en-zh.pdparams",
- }
-}
-
-model_alias = {"fat_st": "paddlespeech.s2t.models.u2_st:U2STModel"}
-
-kaldi_bins = {
- "url":
- "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/kaldi_bins.tar.gz",
- "md5":
- "c0682303b3f3393dbf6ed4c4e35a53eb",
-}
diff --git a/paddlespeech/cli/stats/infer.py b/paddlespeech/cli/stats/infer.py
deleted file mode 100644
index 7cf4f2368cbced90bac54cb61bdc1bd8fc3d07f8..0000000000000000000000000000000000000000
--- a/paddlespeech/cli/stats/infer.py
+++ /dev/null
@@ -1,146 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-from typing import List
-
-from prettytable import PrettyTable
-
-from ..utils import cli_register
-from ..utils import stats_wrapper
-
-__all__ = ['StatsExecutor']
-
-model_name_format = {
- 'asr': 'Model-Language-Sample Rate',
- 'cls': 'Model-Sample Rate',
- 'st': 'Model-Source language-Target language',
- 'text': 'Model-Task-Language',
- 'tts': 'Model-Language',
- 'vector': 'Model-Sample Rate'
-}
-
-
-@cli_register(
- name='paddlespeech.stats',
- description='Get speech tasks support models list.')
-class StatsExecutor():
- def __init__(self):
- super().__init__()
-
- self.parser = argparse.ArgumentParser(
- prog='paddlespeech.stats', add_help=True)
- self.task_choices = ['asr', 'cls', 'st', 'text', 'tts', 'vector']
- self.parser.add_argument(
- '--task',
- type=str,
- default='asr',
- choices=self.task_choices,
- help='Choose speech task.',
- required=True)
-
- def show_support_models(self, pretrained_models: dict):
- fields = model_name_format[self.task].split("-")
- table = PrettyTable(fields)
- for key in pretrained_models:
- table.add_row(key.split("-"))
- print(table)
-
- def execute(self, argv: List[str]) -> bool:
- """
- Command line entry.
- """
- parser_args = self.parser.parse_args(argv)
- has_exceptions = False
- try:
- self(parser_args.task)
- except Exception as e:
- has_exceptions = True
- if has_exceptions:
- return False
- else:
- return True
-
- @stats_wrapper
- def __call__(
- self,
- task: str=None, ):
- """
- Python API to call an executor.
- """
- self.task = task
- if self.task not in self.task_choices:
- print("Please input correct speech task, choices = " + str(
- self.task_choices))
-
- elif self.task == 'asr':
- try:
- from ..asr.pretrained_models import pretrained_models
- print(
- "Here is the list of ASR pretrained models released by PaddleSpeech that can be used by command line and python API"
- )
- self.show_support_models(pretrained_models)
- except BaseException:
- print("Failed to get the list of ASR pretrained models.")
-
- elif self.task == 'cls':
- try:
- from ..cls.pretrained_models import pretrained_models
- print(
- "Here is the list of CLS pretrained models released by PaddleSpeech that can be used by command line and python API"
- )
- self.show_support_models(pretrained_models)
- except BaseException:
- print("Failed to get the list of CLS pretrained models.")
-
- elif self.task == 'st':
- try:
- from ..st.pretrained_models import pretrained_models
- print(
- "Here is the list of ST pretrained models released by PaddleSpeech that can be used by command line and python API"
- )
- self.show_support_models(pretrained_models)
- except BaseException:
- print("Failed to get the list of ST pretrained models.")
-
- elif self.task == 'text':
- try:
- from ..text.pretrained_models import pretrained_models
- print(
- "Here is the list of TEXT pretrained models released by PaddleSpeech that can be used by command line and python API"
- )
- self.show_support_models(pretrained_models)
- except BaseException:
- print("Failed to get the list of TEXT pretrained models.")
-
- elif self.task == 'tts':
- try:
- from ..tts.pretrained_models import pretrained_models
- print(
- "Here is the list of TTS pretrained models released by PaddleSpeech that can be used by command line and python API"
- )
- self.show_support_models(pretrained_models)
- except BaseException:
- print("Failed to get the list of TTS pretrained models.")
-
- elif self.task == 'vector':
- try:
- from ..vector.pretrained_models import pretrained_models
- print(
- "Here is the list of Speaker Recognition pretrained models released by PaddleSpeech that can be used by command line and python API"
- )
- self.show_support_models(pretrained_models)
- except BaseException:
- print(
- "Failed to get the list of Speaker Recognition pretrained models."
- )
diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py
index 97f3bbe21346dfa6651c773e44eb293c4baa841a..7b8faf99c84691971744fbef291a714900dc60bc 100644
--- a/paddlespeech/cli/text/infer.py
+++ b/paddlespeech/cli/text/infer.py
@@ -23,24 +23,14 @@ import paddle
from ..executor import BaseExecutor
from ..log import logger
-from ..utils import cli_register
from ..utils import stats_wrapper
-from .pretrained_models import model_alias
-from .pretrained_models import pretrained_models
-from .pretrained_models import tokenizer_alias
-from paddlespeech.utils.dynamic_import import dynamic_import
__all__ = ['TextExecutor']
-@cli_register(name='paddlespeech.text', description='Text infer command.')
class TextExecutor(BaseExecutor):
def __init__(self):
- super().__init__()
- self.model_alias = model_alias
- self.pretrained_models = pretrained_models
- self.tokenizer_alias = tokenizer_alias
-
+ super().__init__(task='text')
self.parser = argparse.ArgumentParser(
prog='paddlespeech.text', add_help=True)
self.parser.add_argument(
@@ -56,7 +46,8 @@ class TextExecutor(BaseExecutor):
type=str,
default='ernie_linear_p7_wudao',
choices=[
- tag[:tag.index('-')] for tag in self.pretrained_models.keys()
+ tag[:tag.index('-')]
+ for tag in self.task_resource.pretrained_models.keys()
],
help='Choose model type of text task.')
self.parser.add_argument(
@@ -114,13 +105,16 @@ class TextExecutor(BaseExecutor):
if cfg_path is None or ckpt_path is None or vocab_file is None:
tag = '-'.join([model_type, task, lang])
- self.res_path = self._get_pretrained_path(tag)
+ self.task_resource.set_task_model(tag, version=None)
self.cfg_path = os.path.join(
- self.res_path, self.pretrained_models[tag]['cfg_path'])
+ self.task_resource.res_dir,
+ self.task_resource.res_dict['cfg_path'])
self.ckpt_path = os.path.join(
- self.res_path, self.pretrained_models[tag]['ckpt_path'])
+ self.task_resource.res_dir,
+ self.task_resource.res_dict['ckpt_path'])
self.vocab_file = os.path.join(
- self.res_path, self.pretrained_models[tag]['vocab_file'])
+ self.task_resource.res_dir,
+ self.task_resource.res_dict['vocab_file'])
else:
self.cfg_path = os.path.abspath(cfg_path)
self.ckpt_path = os.path.abspath(ckpt_path)
@@ -135,8 +129,8 @@ class TextExecutor(BaseExecutor):
self._punc_list.append(line.strip())
# model
- model_class = dynamic_import(model_name, self.model_alias)
- tokenizer_class = dynamic_import(model_name, self.tokenizer_alias)
+ model_class, tokenizer_class = self.task_resource.get_model_class(
+ model_name)
self.model = model_class(
cfg_path=self.cfg_path, ckpt_path=self.ckpt_path)
self.tokenizer = tokenizer_class.from_pretrained('ernie-1.0')
@@ -226,7 +220,7 @@ class TextExecutor(BaseExecutor):
if not parser_args.verbose:
self.disable_task_loggers()
- task_source = self.get_task_source(parser_args.input)
+ task_source = self.get_input_source(parser_args.input)
task_results = OrderedDict()
has_exceptions = False
diff --git a/paddlespeech/cli/text/pretrained_models.py b/paddlespeech/cli/text/pretrained_models.py
deleted file mode 100644
index 817d3caa3cdc634a202703d4885796b21eee4f56..0000000000000000000000000000000000000000
--- a/paddlespeech/cli/text/pretrained_models.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-pretrained_models = {
- # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
- # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k".
- # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
- # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
- "ernie_linear_p7_wudao-punc-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p7_wudao-punc-zh.tar.gz',
- 'md5':
- '12283e2ddde1797c5d1e57036b512746',
- 'cfg_path':
- 'ckpt/model_config.json',
- 'ckpt_path':
- 'ckpt/model_state.pdparams',
- 'vocab_file':
- 'punc_vocab.txt',
- },
- "ernie_linear_p3_wudao-punc-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_wudao-punc-zh.tar.gz',
- 'md5':
- '448eb2fdf85b6a997e7e652e80c51dd2',
- 'cfg_path':
- 'ckpt/model_config.json',
- 'ckpt_path':
- 'ckpt/model_state.pdparams',
- 'vocab_file':
- 'punc_vocab.txt',
- },
-}
-
-model_alias = {
- "ernie_linear_p7": "paddlespeech.text.models:ErnieLinear",
- "ernie_linear_p3": "paddlespeech.text.models:ErnieLinear",
-}
-
-tokenizer_alias = {
- "ernie_linear_p7": "paddlenlp.transformers:ErnieTokenizer",
- "ernie_linear_p3": "paddlenlp.transformers:ErnieTokenizer",
-}
diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
index efab9cb258a34c8ee7f003fd6f1a21fe8e77a126..4e0337bccea500c382f0782860cec36ad4897c46 100644
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -28,26 +28,17 @@ from yacs.config import CfgNode
from ..executor import BaseExecutor
from ..log import logger
-from ..utils import cli_register
from ..utils import stats_wrapper
-from .pretrained_models import model_alias
-from .pretrained_models import pretrained_models
from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.modules.normalizer import ZScore
-from paddlespeech.utils.dynamic_import import dynamic_import
__all__ = ['TTSExecutor']
-@cli_register(
- name='paddlespeech.tts', description='Text to Speech infer command.')
class TTSExecutor(BaseExecutor):
def __init__(self):
- super().__init__()
- self.model_alias = model_alias
- self.pretrained_models = pretrained_models
-
+ super().__init__('tts')
self.parser = argparse.ArgumentParser(
prog='paddlespeech.tts', add_help=True)
self.parser.add_argument(
@@ -186,19 +177,23 @@ class TTSExecutor(BaseExecutor):
return
# am
am_tag = am + '-' + lang
+ self.task_resource.set_task_model(
+ model_tag=am_tag,
+ model_type=0, # am
+ version=None, # default version
+ )
if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None:
- am_res_path = self._get_pretrained_path(am_tag)
- self.am_res_path = am_res_path
- self.am_config = os.path.join(
- am_res_path, self.pretrained_models[am_tag]['config'])
- self.am_ckpt = os.path.join(am_res_path,
- self.pretrained_models[am_tag]['ckpt'])
+ self.am_res_path = self.task_resource.res_dir
+ self.am_config = os.path.join(self.am_res_path,
+ self.task_resource.res_dict['config'])
+ self.am_ckpt = os.path.join(self.am_res_path,
+ self.task_resource.res_dict['ckpt'])
self.am_stat = os.path.join(
- am_res_path, self.pretrained_models[am_tag]['speech_stats'])
+ self.am_res_path, self.task_resource.res_dict['speech_stats'])
# must have phones_dict in acoustic
self.phones_dict = os.path.join(
- am_res_path, self.pretrained_models[am_tag]['phones_dict'])
- logger.info(am_res_path)
+ self.am_res_path, self.task_resource.res_dict['phones_dict'])
+ logger.info(self.am_res_path)
logger.info(self.am_config)
logger.info(self.am_ckpt)
else:
@@ -210,32 +205,37 @@ class TTSExecutor(BaseExecutor):
# for speedyspeech
self.tones_dict = None
- if 'tones_dict' in self.pretrained_models[am_tag]:
+ if 'tones_dict' in self.task_resource.res_dict:
self.tones_dict = os.path.join(
- am_res_path, self.pretrained_models[am_tag]['tones_dict'])
+ self.am_res_path, self.task_resource.res_dict['tones_dict'])
if tones_dict:
self.tones_dict = tones_dict
# for multi speaker fastspeech2
self.speaker_dict = None
- if 'speaker_dict' in self.pretrained_models[am_tag]:
+ if 'speaker_dict' in self.task_resource.res_dict:
self.speaker_dict = os.path.join(
- am_res_path, self.pretrained_models[am_tag]['speaker_dict'])
+ self.am_res_path, self.task_resource.res_dict['speaker_dict'])
if speaker_dict:
self.speaker_dict = speaker_dict
# voc
voc_tag = voc + '-' + lang
+ self.task_resource.set_task_model(
+ model_tag=voc_tag,
+ model_type=1, # vocoder
+ version=None, # default version
+ )
if voc_ckpt is None or voc_config is None or voc_stat is None:
- voc_res_path = self._get_pretrained_path(voc_tag)
- self.voc_res_path = voc_res_path
+ self.voc_res_path = self.task_resource.voc_res_dir
self.voc_config = os.path.join(
- voc_res_path, self.pretrained_models[voc_tag]['config'])
+ self.voc_res_path, self.task_resource.voc_res_dict['config'])
self.voc_ckpt = os.path.join(
- voc_res_path, self.pretrained_models[voc_tag]['ckpt'])
+ self.voc_res_path, self.task_resource.voc_res_dict['ckpt'])
self.voc_stat = os.path.join(
- voc_res_path, self.pretrained_models[voc_tag]['speech_stats'])
- logger.info(voc_res_path)
+ self.voc_res_path,
+ self.task_resource.voc_res_dict['speech_stats'])
+ logger.info(self.voc_res_path)
logger.info(self.voc_config)
logger.info(self.voc_ckpt)
else:
@@ -285,9 +285,9 @@ class TTSExecutor(BaseExecutor):
# model: {model_name}_{dataset}
am_name = am[:am.rindex('_')]
- am_class = dynamic_import(am_name, self.model_alias)
- am_inference_class = dynamic_import(am_name + '_inference',
- self.model_alias)
+ am_class = self.task_resource.get_model_class(am_name)
+ am_inference_class = self.task_resource.get_model_class(am_name +
+ '_inference')
if am_name == 'fastspeech2':
am = am_class(
@@ -316,9 +316,9 @@ class TTSExecutor(BaseExecutor):
# vocoder
# model: {model_name}_{dataset}
voc_name = voc[:voc.rindex('_')]
- voc_class = dynamic_import(voc_name, self.model_alias)
- voc_inference_class = dynamic_import(voc_name + '_inference',
- self.model_alias)
+ voc_class = self.task_resource.get_model_class(voc_name)
+ voc_inference_class = self.task_resource.get_model_class(voc_name +
+ '_inference')
if voc_name != 'wavernn':
voc = voc_class(**self.voc_config["generator_params"])
voc.set_state_dict(paddle.load(self.voc_ckpt)["generator_params"])
@@ -446,7 +446,7 @@ class TTSExecutor(BaseExecutor):
if not args.verbose:
self.disable_task_loggers()
- task_source = self.get_task_source(args.input)
+ task_source = self.get_input_source(args.input)
task_results = OrderedDict()
has_exceptions = False
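For TTS the executor needs two resource sets, one for the acoustic model and one for the vocoder; the patch distinguishes them with model_type and exposes the vocoder paths through voc_res_dir/voc_res_dict. A sketch, assuming the same tags and keys used above:

    from paddlespeech.resource import CommonTaskResource

    resource = CommonTaskResource(task='tts')
    resource.set_task_model(model_tag='fastspeech2_csmsc-zh', model_type=0, version=None)  # acoustic model
    resource.set_task_model(model_tag='hifigan_csmsc-zh', model_type=1, version=None)      # vocoder
    am_config = resource.res_dict['config']       # relative to resource.res_dir
    voc_config = resource.voc_res_dict['config']  # relative to resource.voc_res_dir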
diff --git a/paddlespeech/cli/tts/pretrained_models.py b/paddlespeech/cli/tts/pretrained_models.py
deleted file mode 100644
index 65254a9353fc997038d84368d3918f055d2ccee0..0000000000000000000000000000000000000000
--- a/paddlespeech/cli/tts/pretrained_models.py
+++ /dev/null
@@ -1,300 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-pretrained_models = {
- # speedyspeech
- "speedyspeech_csmsc-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip',
- 'md5':
- '6f6fa967b408454b6662c8c00c0027cb',
- 'config':
- 'default.yaml',
- 'ckpt':
- 'snapshot_iter_30600.pdz',
- 'speech_stats':
- 'feats_stats.npy',
- 'phones_dict':
- 'phone_id_map.txt',
- 'tones_dict':
- 'tone_id_map.txt',
- },
-
- # fastspeech2
- "fastspeech2_csmsc-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip',
- 'md5':
- '637d28a5e53aa60275612ba4393d5f22',
- 'config':
- 'default.yaml',
- 'ckpt':
- 'snapshot_iter_76000.pdz',
- 'speech_stats':
- 'speech_stats.npy',
- 'phones_dict':
- 'phone_id_map.txt',
- },
- "fastspeech2_ljspeech-en": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip',
- 'md5':
- 'ffed800c93deaf16ca9b3af89bfcd747',
- 'config':
- 'default.yaml',
- 'ckpt':
- 'snapshot_iter_100000.pdz',
- 'speech_stats':
- 'speech_stats.npy',
- 'phones_dict':
- 'phone_id_map.txt',
- },
- "fastspeech2_aishell3-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip',
- 'md5':
- 'f4dd4a5f49a4552b77981f544ab3392e',
- 'config':
- 'default.yaml',
- 'ckpt':
- 'snapshot_iter_96400.pdz',
- 'speech_stats':
- 'speech_stats.npy',
- 'phones_dict':
- 'phone_id_map.txt',
- 'speaker_dict':
- 'speaker_id_map.txt',
- },
- "fastspeech2_vctk-en": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip',
- 'md5':
- '743e5024ca1e17a88c5c271db9779ba4',
- 'config':
- 'default.yaml',
- 'ckpt':
- 'snapshot_iter_66200.pdz',
- 'speech_stats':
- 'speech_stats.npy',
- 'phones_dict':
- 'phone_id_map.txt',
- 'speaker_dict':
- 'speaker_id_map.txt',
- },
- # tacotron2
- "tacotron2_csmsc-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip',
- 'md5':
- '0df4b6f0bcbe0d73c5ed6df8867ab91a',
- 'config':
- 'default.yaml',
- 'ckpt':
- 'snapshot_iter_30600.pdz',
- 'speech_stats':
- 'speech_stats.npy',
- 'phones_dict':
- 'phone_id_map.txt',
- },
- "tacotron2_ljspeech-en": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip',
- 'md5':
- '6a5eddd81ae0e81d16959b97481135f3',
- 'config':
- 'default.yaml',
- 'ckpt':
- 'snapshot_iter_60300.pdz',
- 'speech_stats':
- 'speech_stats.npy',
- 'phones_dict':
- 'phone_id_map.txt',
- },
-
- # pwgan
- "pwgan_csmsc-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip',
- 'md5':
- '2e481633325b5bdf0a3823c714d2c117',
- 'config':
- 'pwg_default.yaml',
- 'ckpt':
- 'pwg_snapshot_iter_400000.pdz',
- 'speech_stats':
- 'pwg_stats.npy',
- },
- "pwgan_ljspeech-en": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip',
- 'md5':
- '53610ba9708fd3008ccaf8e99dacbaf0',
- 'config':
- 'pwg_default.yaml',
- 'ckpt':
- 'pwg_snapshot_iter_400000.pdz',
- 'speech_stats':
- 'pwg_stats.npy',
- },
- "pwgan_aishell3-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip',
- 'md5':
- 'd7598fa41ad362d62f85ffc0f07e3d84',
- 'config':
- 'default.yaml',
- 'ckpt':
- 'snapshot_iter_1000000.pdz',
- 'speech_stats':
- 'feats_stats.npy',
- },
- "pwgan_vctk-en": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip',
- 'md5':
- 'b3da1defcde3e578be71eb284cb89f2c',
- 'config':
- 'default.yaml',
- 'ckpt':
- 'snapshot_iter_1500000.pdz',
- 'speech_stats':
- 'feats_stats.npy',
- },
- # mb_melgan
- "mb_melgan_csmsc-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip',
- 'md5':
- 'ee5f0604e20091f0d495b6ec4618b90d',
- 'config':
- 'default.yaml',
- 'ckpt':
- 'snapshot_iter_1000000.pdz',
- 'speech_stats':
- 'feats_stats.npy',
- },
- # style_melgan
- "style_melgan_csmsc-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip',
- 'md5':
- '5de2d5348f396de0c966926b8c462755',
- 'config':
- 'default.yaml',
- 'ckpt':
- 'snapshot_iter_1500000.pdz',
- 'speech_stats':
- 'feats_stats.npy',
- },
- # hifigan
- "hifigan_csmsc-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip',
- 'md5':
- 'dd40a3d88dfcf64513fba2f0f961ada6',
- 'config':
- 'default.yaml',
- 'ckpt':
- 'snapshot_iter_2500000.pdz',
- 'speech_stats':
- 'feats_stats.npy',
- },
- "hifigan_ljspeech-en": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip',
- 'md5':
- '70e9131695decbca06a65fe51ed38a72',
- 'config':
- 'default.yaml',
- 'ckpt':
- 'snapshot_iter_2500000.pdz',
- 'speech_stats':
- 'feats_stats.npy',
- },
- "hifigan_aishell3-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip',
- 'md5':
- '3bb49bc75032ed12f79c00c8cc79a09a',
- 'config':
- 'default.yaml',
- 'ckpt':
- 'snapshot_iter_2500000.pdz',
- 'speech_stats':
- 'feats_stats.npy',
- },
- "hifigan_vctk-en": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip',
- 'md5':
- '7da8f88359bca2457e705d924cf27bd4',
- 'config':
- 'default.yaml',
- 'ckpt':
- 'snapshot_iter_2500000.pdz',
- 'speech_stats':
- 'feats_stats.npy',
- },
-
- # wavernn
- "wavernn_csmsc-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip',
- 'md5':
- 'ee37b752f09bcba8f2af3b777ca38e13',
- 'config':
- 'default.yaml',
- 'ckpt':
- 'snapshot_iter_400000.pdz',
- 'speech_stats':
- 'feats_stats.npy',
- }
-}
-
-model_alias = {
- # acoustic model
- "speedyspeech":
- "paddlespeech.t2s.models.speedyspeech:SpeedySpeech",
- "speedyspeech_inference":
- "paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference",
- "fastspeech2":
- "paddlespeech.t2s.models.fastspeech2:FastSpeech2",
- "fastspeech2_inference":
- "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
- "tacotron2":
- "paddlespeech.t2s.models.tacotron2:Tacotron2",
- "tacotron2_inference":
- "paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
- # voc
- "pwgan":
- "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
- "pwgan_inference":
- "paddlespeech.t2s.models.parallel_wavegan:PWGInference",
- "mb_melgan":
- "paddlespeech.t2s.models.melgan:MelGANGenerator",
- "mb_melgan_inference":
- "paddlespeech.t2s.models.melgan:MelGANInference",
- "style_melgan":
- "paddlespeech.t2s.models.melgan:StyleMelGANGenerator",
- "style_melgan_inference":
- "paddlespeech.t2s.models.melgan:StyleMelGANInference",
- "hifigan":
- "paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
- "hifigan_inference":
- "paddlespeech.t2s.models.hifigan:HiFiGANInference",
- "wavernn":
- "paddlespeech.t2s.models.wavernn:WaveRNN",
- "wavernn_inference":
- "paddlespeech.t2s.models.wavernn:WaveRNNInference",
-}
diff --git a/paddlespeech/cli/utils.py b/paddlespeech/cli/utils.py
index e7b499f728c3d93ecdfc3bd8fdf92559ce59845a..128767e627091dc636da2900e5e65b58bdd650ca 100644
--- a/paddlespeech/cli/utils.py
+++ b/paddlespeech/cli/utils.py
@@ -41,6 +41,7 @@ requests.adapters.DEFAULT_RETRIES = 3
__all__ = [
'timer_register',
'cli_register',
+ 'explicit_command_register',
'get_command',
'download_and_decompress',
'load_state_dict_from_url',
@@ -70,6 +71,16 @@ def cli_register(name: str, description: str='') -> Any:
return _warpper
+def explicit_command_register(name: str, description: str='', cls: str=''):
+ items = name.split('.')
+ com = commands
+ for item in items:
+ com = com[item]
+ com['_entry'] = cls
+ if description:
+ com['_description'] = description
+
+
def get_command(name: str) -> Any:
items = name.split('.')
com = commands
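explicit_command_register walks the same nested commands tree that cli_register fills, but leaves a dotted string in '_entry' instead of a class object. Roughly, after registering the TTS command (sketch):

    commands_tree = {
        'paddlespeech': {
            'tts': {
                '_entry': 'paddlespeech.cli.tts.TTSExecutor',
                '_description': 'Text to Speech infer command.',
            },
        },
    }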
diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py
index cc664369fa2a82f929e8112610955e1aa727a6d2..56f86f9b8676a6416e6c30e7b5a78f04d49b3963 100644
--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
@@ -22,30 +22,20 @@ from typing import Union
import paddle
import soundfile
-from paddleaudio.backends import load as load_audio
-from paddleaudio.compliance.librosa import melspectrogram
from yacs.config import CfgNode
from ..executor import BaseExecutor
from ..log import logger
-from ..utils import cli_register
from ..utils import stats_wrapper
-from .pretrained_models import model_alias
-from .pretrained_models import pretrained_models
-from paddlespeech.utils.dynamic_import import dynamic_import
+from paddleaudio.backends import load as load_audio
+from paddleaudio.compliance.librosa import melspectrogram
from paddlespeech.vector.io.batch import feature_normalize
from paddlespeech.vector.modules.sid_model import SpeakerIdetification
-@cli_register(
- name="paddlespeech.vector",
- description="Speech to vector embedding infer command.")
class VectorExecutor(BaseExecutor):
def __init__(self):
- super().__init__()
- self.model_alias = model_alias
- self.pretrained_models = pretrained_models
-
+ super().__init__('vector')
self.parser = argparse.ArgumentParser(
prog="paddlespeech.vector", add_help=True)
@@ -53,7 +43,10 @@ class VectorExecutor(BaseExecutor):
"--model",
type=str,
default="ecapatdnn_voxceleb12",
- choices=["ecapatdnn_voxceleb12"],
+ choices=[
+ tag[:tag.index('-')]
+ for tag in self.task_resource.pretrained_models.keys()
+ ],
help="Choose model type of vector task.")
self.parser.add_argument(
"--task",
@@ -123,7 +116,7 @@ class VectorExecutor(BaseExecutor):
self.disable_task_loggers()
# stage 2: read the input data and store them as a list
- task_source = self.get_task_source(parser_args.input)
+ task_source = self.get_input_source(parser_args.input)
logger.info(f"task source: {task_source}")
# stage 3: process the audio one by one
@@ -300,17 +293,18 @@ class VectorExecutor(BaseExecutor):
# get the mode from pretrained list
sample_rate_str = "16k" if sample_rate == 16000 else "8k"
tag = model_type + "-" + sample_rate_str
+ self.task_resource.set_task_model(tag, version=None)
logger.info(f"load the pretrained model: {tag}")
# get the model from the pretrained list
# we download the pretrained model and store it in the res_path
- res_path = self._get_pretrained_path(tag)
- self.res_path = res_path
+ self.res_path = self.task_resource.res_dir
self.cfg_path = os.path.join(
- res_path, self.pretrained_models[tag]['cfg_path'])
+ self.task_resource.res_dir,
+ self.task_resource.res_dict['cfg_path'])
self.ckpt_path = os.path.join(
- res_path,
- self.pretrained_models[tag]['ckpt_path'] + '.pdparams')
+ self.task_resource.res_dir,
+ self.task_resource.res_dict['ckpt_path'] + '.pdparams')
else:
# get the model from disk
self.cfg_path = os.path.abspath(cfg_path)
@@ -329,8 +323,8 @@ class VectorExecutor(BaseExecutor):
# stage 3: get the model name to instance the model network with dynamic_import
logger.info("start to dynamic import the model class")
model_name = model_type[:model_type.rindex('_')]
+ model_class = self.task_resource.get_model_class(model_name)
logger.info(f"model name {model_name}")
- model_class = dynamic_import(model_name, self.model_alias)
model_conf = self.config.model
backbone = model_class(**model_conf)
model = SpeakerIdetification(
@@ -476,4 +470,4 @@ class VectorExecutor(BaseExecutor):
else:
logger.info("The audio file format is right")
- return True
\ No newline at end of file
+ return True
diff --git a/paddlespeech/cli/vector/pretrained_models.py b/paddlespeech/cli/vector/pretrained_models.py
deleted file mode 100644
index 4d1d3a048b22550fa85d77c4a8d5fae5b39a56e2..0000000000000000000000000000000000000000
--- a/paddlespeech/cli/vector/pretrained_models.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-pretrained_models = {
- # The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]".
- # e.g. "ecapatdnn_voxceleb12-16k".
- # Command line and python api use "{model_name}[-{dataset}]" as --model, usage:
- # "paddlespeech vector --task spk --model ecapatdnn_voxceleb12-16k --sr 16000 --input ./input.wav"
- "ecapatdnn_voxceleb12-16k": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_1.tar.gz',
- 'md5':
- '67c7ff8885d5246bd16e0f5ac1cba99f',
- 'cfg_path':
- 'conf/model.yaml', # the yaml config path
- 'ckpt_path':
- 'model/model', # the format is ${dir}/{model_name},
- # so the first 'model' is dir, the second 'model' is the name
- # this means we have a model stored as model/model.pdparams
- },
-}
-
-model_alias = {
- "ecapatdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
-}
diff --git a/paddlespeech/kws/exps/mdtc/compute_det.py b/paddlespeech/kws/exps/mdtc/compute_det.py
index e43a953dbc35246f295a79790b2da55837318114..853056966376cd7551228ebbabdd7bddf334189e 100644
--- a/paddlespeech/kws/exps/mdtc/compute_det.py
+++ b/paddlespeech/kws/exps/mdtc/compute_det.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2021 Binbin Zhang(binbzha@qq.com)
+# 2022 Shaoqing Yu(954793264@qq.com)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/paddlespeech/kws/exps/mdtc/plot_det_curve.py b/paddlespeech/kws/exps/mdtc/plot_det_curve.py
index a3ea21eff9773f922c789cccaf6b5d5f02fed5c5..4960281eed3b87fe1fd43374382a09526799a663 100644
--- a/paddlespeech/kws/exps/mdtc/plot_det_curve.py
+++ b/paddlespeech/kws/exps/mdtc/plot_det_curve.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2021 Binbin Zhang(binbzha@qq.com)
+# Menglong Xu
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/paddlespeech/kws/exps/mdtc/score.py b/paddlespeech/kws/exps/mdtc/score.py
index 1b5e1e2967f62c5f6259ddc40ad8929fd61d1a7c..556455ca19e135b2c9eee4a2523e42c0a2ad1d0e 100644
--- a/paddlespeech/kws/exps/mdtc/score.py
+++ b/paddlespeech/kws/exps/mdtc/score.py
@@ -1,4 +1,6 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2021 Binbin Zhang(binbzha@qq.com)
+# 2022 Shaoqing Yu(954793264@qq.com)
+# 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/paddlespeech/kws/models/loss.py b/paddlespeech/kws/models/loss.py
index 64c9a32c9f128338081999ebd209719e90fcf98c..bda77f2ba54cc26523a47b33f6ced1be748a2671 100644
--- a/paddlespeech/kws/models/loss.py
+++ b/paddlespeech/kws/models/loss.py
@@ -1,3 +1,4 @@
+# Copyright (c) 2021 Binbin Zhang
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/paddlespeech/kws/models/mdtc.py b/paddlespeech/kws/models/mdtc.py
index 5d2e5de649c5adc43536e15f1867d19e75b589f6..c605a02b6d0167f9a07267ab9d9f3e5b981e0e5f 100644
--- a/paddlespeech/kws/models/mdtc.py
+++ b/paddlespeech/kws/models/mdtc.py
@@ -1,3 +1,4 @@
+# Copyright (c) 2021 Jingyong Hou (houjingyong@gmail.com)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/paddlespeech/cli/stats/__init__.py b/paddlespeech/resource/__init__.py
similarity index 83%
rename from paddlespeech/cli/stats/__init__.py
rename to paddlespeech/resource/__init__.py
index 9fe6c4abaf10de2f24f751ddd62f456768a82475..e143413af7a7cecb59b10b80296ec8d95490b14a 100644
--- a/paddlespeech/cli/stats/__init__.py
+++ b/paddlespeech/resource/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -11,4 +11,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from .infer import StatsExecutor
+from .resource import CommonTaskResource
diff --git a/paddlespeech/resource/model_alias.py b/paddlespeech/resource/model_alias.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b19ed065bc103ccb68e4722fab24e81b443007d
--- /dev/null
+++ b/paddlespeech/resource/model_alias.py
@@ -0,0 +1,87 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+ 'model_alias',
+]
+
+# Map each model name to the import path(s) of its class(es).
+model_alias = {
+ # ---------------------------------
+ # -------------- ASR --------------
+ # ---------------------------------
+ "deepspeech2offline": ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"],
+ "deepspeech2online":
+ ["paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline"],
+ "conformer": ["paddlespeech.s2t.models.u2:U2Model"],
+ "conformer_online": ["paddlespeech.s2t.models.u2:U2Model"],
+ "transformer": ["paddlespeech.s2t.models.u2:U2Model"],
+ "wenetspeech": ["paddlespeech.s2t.models.u2:U2Model"],
+
+ # ---------------------------------
+ # -------------- CLS --------------
+ # ---------------------------------
+ "panns_cnn6": ["paddlespeech.cls.models.panns:CNN6"],
+ "panns_cnn10": ["paddlespeech.cls.models.panns:CNN10"],
+ "panns_cnn14": ["paddlespeech.cls.models.panns:CNN14"],
+
+ # ---------------------------------
+ # -------------- ST ---------------
+ # ---------------------------------
+ "fat_st": ["paddlespeech.s2t.models.u2_st:U2STModel"],
+
+ # ---------------------------------
+ # -------------- TEXT -------------
+ # ---------------------------------
+ "ernie_linear_p7": [
+ "paddlespeech.text.models:ErnieLinear",
+ "paddlenlp.transformers:ErnieTokenizer"
+ ],
+ "ernie_linear_p3": [
+ "paddlespeech.text.models:ErnieLinear",
+ "paddlenlp.transformers:ErnieTokenizer"
+ ],
+
+ # ---------------------------------
+ # -------------- TTS --------------
+ # ---------------------------------
+ # acoustic model
+ "speedyspeech": ["paddlespeech.t2s.models.speedyspeech:SpeedySpeech"],
+ "speedyspeech_inference":
+ ["paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference"],
+ "fastspeech2": ["paddlespeech.t2s.models.fastspeech2:FastSpeech2"],
+ "fastspeech2_inference":
+ ["paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference"],
+ "tacotron2": ["paddlespeech.t2s.models.tacotron2:Tacotron2"],
+ "tacotron2_inference":
+ ["paddlespeech.t2s.models.tacotron2:Tacotron2Inference"],
+ # voc
+ "pwgan": ["paddlespeech.t2s.models.parallel_wavegan:PWGGenerator"],
+ "pwgan_inference":
+ ["paddlespeech.t2s.models.parallel_wavegan:PWGInference"],
+ "mb_melgan": ["paddlespeech.t2s.models.melgan:MelGANGenerator"],
+ "mb_melgan_inference": ["paddlespeech.t2s.models.melgan:MelGANInference"],
+ "style_melgan": ["paddlespeech.t2s.models.melgan:StyleMelGANGenerator"],
+ "style_melgan_inference":
+ ["paddlespeech.t2s.models.melgan:StyleMelGANInference"],
+ "hifigan": ["paddlespeech.t2s.models.hifigan:HiFiGANGenerator"],
+ "hifigan_inference": ["paddlespeech.t2s.models.hifigan:HiFiGANInference"],
+ "wavernn": ["paddlespeech.t2s.models.wavernn:WaveRNN"],
+ "wavernn_inference": ["paddlespeech.t2s.models.wavernn:WaveRNNInference"],
+
+ # ---------------------------------
+ # ------------ Vector -------------
+ # ---------------------------------
+ "ecapatdnn": ["paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn"],
+}
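Aliases now map to lists so a single model name can bundle several import paths; for the text task this covers both the model and its tokenizer, which is why TextExecutor unpacks two classes from get_model_class above. A sketch of resolving one 'module:Class' entry:

    import importlib

    from paddlespeech.resource.model_alias import model_alias

    def import_class(path: str):
        """Resolve a 'module:Class' string to the class object (sketch)."""
        module_name, class_name = path.split(':')
        return getattr(importlib.import_module(module_name), class_name)

    ernie_classes = [import_class(p) for p in model_alias['ernie_linear_p7']]
    # -> [ErnieLinear, ErnieTokenizer]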
diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py
new file mode 100644
index 0000000000000000000000000000000000000000..84362f967b5c473ea28bf25f4cbbe8900ec11072
--- /dev/null
+++ b/paddlespeech/resource/pretrained_models.py
@@ -0,0 +1,838 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+ 'asr_dynamic_pretrained_models',
+ 'asr_static_pretrained_models',
+ 'cls_dynamic_pretrained_models',
+ 'cls_static_pretrained_models',
+ 'st_dynamic_pretrained_models',
+ 'st_kaldi_bins',
+ 'text_dynamic_pretrained_models',
+ 'tts_dynamic_pretrained_models',
+ 'tts_static_pretrained_models',
+ 'tts_onnx_pretrained_models',
+ 'vector_dynamic_pretrained_models',
+]
+
+# The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
+# e.g. "conformer_wenetspeech-zh-16k" and "panns_cnn6-32k".
+# Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
+# "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
+
+# ---------------------------------
+# -------------- ASR --------------
+# ---------------------------------
+asr_dynamic_pretrained_models = {
+ "conformer_wenetspeech-zh-16k": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1_conformer_wenetspeech_ckpt_0.1.1.model.tar.gz',
+ 'md5':
+ '76cb19ed857e6623856b7cd7ebbfeda4',
+ 'cfg_path':
+ 'model.yaml',
+ 'ckpt_path':
+ 'exp/conformer/checkpoints/wenetspeech',
+ },
+ },
+ "conformer_online_wenetspeech-zh-16k": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz',
+ 'md5':
+ 'b8c02632b04da34aca88459835be54a6',
+ 'cfg_path':
+ 'model.yaml',
+ 'ckpt_path':
+ 'exp/chunk_conformer/checkpoints/avg_10',
+ 'model':
+ 'exp/chunk_conformer/checkpoints/avg_10.pdparams',
+ 'params':
+ 'exp/chunk_conformer/checkpoints/avg_10.pdparams',
+ 'lm_url':
+ '',
+ 'lm_md5':
+ '',
+ },
+ },
+ "conformer_online_multicn-zh-16k": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.0.model.tar.gz',
+ 'md5':
+ '7989b3248c898070904cf042fd656003',
+ 'cfg_path':
+ 'model.yaml',
+ 'ckpt_path':
+ 'exp/chunk_conformer/checkpoints/multi_cn',
+ },
+ '2.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz',
+ 'md5':
+ '0ac93d390552336f2a906aec9e33c5fa',
+ 'cfg_path':
+ 'model.yaml',
+ 'ckpt_path':
+ 'exp/chunk_conformer/checkpoints/multi_cn',
+ 'model':
+ 'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
+ 'params':
+ 'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
+ 'lm_url':
+ 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
+ 'lm_md5':
+ '29e02312deb2e59b3c8686c7966d4fe3',
+ },
+ },
+ "conformer_aishell-zh-16k": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz',
+ 'md5':
+ '3f073eccfa7bb14e0c6867d65fc0dc3a',
+ 'cfg_path':
+ 'model.yaml',
+ 'ckpt_path':
+ 'exp/conformer/checkpoints/avg_30',
+ },
+ },
+ "conformer_online_aishell-zh-16k": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz',
+ 'md5':
+ 'b374cfb93537761270b6224fb0bfc26a',
+ 'cfg_path':
+ 'model.yaml',
+ 'ckpt_path':
+ 'exp/chunk_conformer/checkpoints/avg_30',
+ },
+ },
+ "transformer_librispeech-en-16k": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz',
+ 'md5':
+ '2c667da24922aad391eacafe37bc1660',
+ 'cfg_path':
+ 'model.yaml',
+ 'ckpt_path':
+ 'exp/transformer/checkpoints/avg_10',
+ },
+ },
+ "deepspeech2online_wenetspeech-zh-16k": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz',
+ 'md5':
+ 'e393d4d274af0f6967db24fc146e8074',
+ 'cfg_path':
+ 'model.yaml',
+ 'ckpt_path':
+ 'exp/deepspeech2_online/checkpoints/avg_10',
+ 'lm_url':
+ 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
+ 'lm_md5':
+ '29e02312deb2e59b3c8686c7966d4fe3'
+ },
+ },
+ "deepspeech2offline_aishell-zh-16k": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
+ 'md5':
+ '932c3593d62fe5c741b59b31318aa314',
+ 'cfg_path':
+ 'model.yaml',
+ 'ckpt_path':
+ 'exp/deepspeech2/checkpoints/avg_1',
+ 'lm_url':
+ 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
+ 'lm_md5':
+ '29e02312deb2e59b3c8686c7966d4fe3'
+ },
+ },
+ "deepspeech2online_aishell-zh-16k": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz',
+ 'md5':
+ '98b87b171b7240b7cae6e07d8d0bc9be',
+ 'cfg_path':
+ 'model.yaml',
+ 'ckpt_path':
+ 'exp/deepspeech2_online/checkpoints/avg_1',
+ 'model':
+ 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel',
+ 'params':
+ 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams',
+ 'lm_url':
+ 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
+ 'lm_md5':
+ '29e02312deb2e59b3c8686c7966d4fe3'
+ },
+ },
+ "deepspeech2offline_librispeech-en-16k": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz',
+ 'md5':
+ 'f5666c81ad015c8de03aac2bc92e5762',
+ 'cfg_path':
+ 'model.yaml',
+ 'ckpt_path':
+ 'exp/deepspeech2/checkpoints/avg_1',
+ 'lm_url':
+ 'https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm',
+ 'lm_md5':
+ '099a601759d467cd0a8523ff939819c5'
+ },
+ },
+}
+
+asr_static_pretrained_models = {
+ "deepspeech2offline_aishell-zh-16k": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
+ 'md5':
+ '932c3593d62fe5c741b59b31318aa314',
+ 'cfg_path':
+ 'model.yaml',
+ 'ckpt_path':
+ 'exp/deepspeech2/checkpoints/avg_1',
+ 'model':
+ 'exp/deepspeech2/checkpoints/avg_1.jit.pdmodel',
+ 'params':
+ 'exp/deepspeech2/checkpoints/avg_1.jit.pdiparams',
+ 'lm_url':
+ 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
+ 'lm_md5':
+ '29e02312deb2e59b3c8686c7966d4fe3'
+ }
+ },
+}
+
+# ---------------------------------
+# -------------- CLS --------------
+# ---------------------------------
+cls_dynamic_pretrained_models = {
+ "panns_cnn6-32k": {
+ '1.0': {
+ 'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn6.tar.gz',
+ 'md5': '4cf09194a95df024fd12f84712cf0f9c',
+ 'cfg_path': 'panns.yaml',
+ 'ckpt_path': 'cnn6.pdparams',
+ 'label_file': 'audioset_labels.txt',
+ },
+ },
+ "panns_cnn10-32k": {
+ '1.0': {
+ 'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn10.tar.gz',
+ 'md5': 'cb8427b22176cc2116367d14847f5413',
+ 'cfg_path': 'panns.yaml',
+ 'ckpt_path': 'cnn10.pdparams',
+ 'label_file': 'audioset_labels.txt',
+ },
+ },
+ "panns_cnn14-32k": {
+ '1.0': {
+ 'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn14.tar.gz',
+ 'md5': 'e3b9b5614a1595001161d0ab95edee97',
+ 'cfg_path': 'panns.yaml',
+ 'ckpt_path': 'cnn14.pdparams',
+ 'label_file': 'audioset_labels.txt',
+ },
+ },
+}
+
+cls_static_pretrained_models = {
+ "panns_cnn6-32k": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn6_static.tar.gz',
+ 'md5':
+ 'da087c31046d23281d8ec5188c1967da',
+ 'cfg_path':
+ 'panns.yaml',
+ 'model_path':
+ 'inference.pdmodel',
+ 'params_path':
+ 'inference.pdiparams',
+ 'label_file':
+ 'audioset_labels.txt',
+ },
+ },
+ "panns_cnn10-32k": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn10_static.tar.gz',
+ 'md5':
+ '5460cc6eafbfaf0f261cc75b90284ae1',
+ 'cfg_path':
+ 'panns.yaml',
+ 'model_path':
+ 'inference.pdmodel',
+ 'params_path':
+ 'inference.pdiparams',
+ 'label_file':
+ 'audioset_labels.txt',
+ },
+ },
+ "panns_cnn14-32k": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn14_static.tar.gz',
+ 'md5':
+ 'ccc80b194821274da79466862b2ab00f',
+ 'cfg_path':
+ 'panns.yaml',
+ 'model_path':
+ 'inference.pdmodel',
+ 'params_path':
+ 'inference.pdiparams',
+ 'label_file':
+ 'audioset_labels.txt',
+ },
+ },
+}
+
+# ---------------------------------
+# -------------- ST ---------------
+# ---------------------------------
+st_dynamic_pretrained_models = {
+ "fat_st_ted-en-zh": {
+ '1.0': {
+ "url":
+ "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/st1_transformer_mtl_noam_ted-en-zh_ckpt_0.1.1.model.tar.gz",
+ "md5":
+ "d62063f35a16d91210a71081bd2dd557",
+ "cfg_path":
+ "model.yaml",
+ "ckpt_path":
+ "exp/transformer_mtl_noam/checkpoints/fat_st_ted-en-zh.pdparams",
+ },
+ },
+}
+
+st_kaldi_bins = {
+ "url":
+ "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/kaldi_bins.tar.gz",
+ "md5":
+ "c0682303b3f3393dbf6ed4c4e35a53eb",
+}
+
+# ---------------------------------
+# -------------- TEXT -------------
+# ---------------------------------
+text_dynamic_pretrained_models = {
+ "ernie_linear_p7_wudao-punc-zh": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p7_wudao-punc-zh.tar.gz',
+ 'md5':
+ '12283e2ddde1797c5d1e57036b512746',
+ 'cfg_path':
+ 'ckpt/model_config.json',
+ 'ckpt_path':
+ 'ckpt/model_state.pdparams',
+ 'vocab_file':
+ 'punc_vocab.txt',
+ },
+ },
+ "ernie_linear_p3_wudao-punc-zh": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_wudao-punc-zh.tar.gz',
+ 'md5':
+ '448eb2fdf85b6a997e7e652e80c51dd2',
+ 'cfg_path':
+ 'ckpt/model_config.json',
+ 'ckpt_path':
+ 'ckpt/model_state.pdparams',
+ 'vocab_file':
+ 'punc_vocab.txt',
+ },
+ },
+}
+
+# ---------------------------------
+# -------------- TTS --------------
+# ---------------------------------
+tts_dynamic_pretrained_models = {
+ # speedyspeech
+ "speedyspeech_csmsc-zh": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip',
+ 'md5':
+ '6f6fa967b408454b6662c8c00c0027cb',
+ 'config':
+ 'default.yaml',
+ 'ckpt':
+ 'snapshot_iter_30600.pdz',
+ 'speech_stats':
+ 'feats_stats.npy',
+ 'phones_dict':
+ 'phone_id_map.txt',
+ 'tones_dict':
+ 'tone_id_map.txt',
+ },
+ },
+ # fastspeech2
+ "fastspeech2_csmsc-zh": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip',
+ 'md5':
+ '637d28a5e53aa60275612ba4393d5f22',
+ 'config':
+ 'default.yaml',
+ 'ckpt':
+ 'snapshot_iter_76000.pdz',
+ 'speech_stats':
+ 'speech_stats.npy',
+ 'phones_dict':
+ 'phone_id_map.txt',
+ },
+ },
+ "fastspeech2_ljspeech-en": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip',
+ 'md5':
+ 'ffed800c93deaf16ca9b3af89bfcd747',
+ 'config':
+ 'default.yaml',
+ 'ckpt':
+ 'snapshot_iter_100000.pdz',
+ 'speech_stats':
+ 'speech_stats.npy',
+ 'phones_dict':
+ 'phone_id_map.txt',
+ },
+ },
+ "fastspeech2_aishell3-zh": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip',
+ 'md5':
+ 'f4dd4a5f49a4552b77981f544ab3392e',
+ 'config':
+ 'default.yaml',
+ 'ckpt':
+ 'snapshot_iter_96400.pdz',
+ 'speech_stats':
+ 'speech_stats.npy',
+ 'phones_dict':
+ 'phone_id_map.txt',
+ 'speaker_dict':
+ 'speaker_id_map.txt',
+ },
+ },
+ "fastspeech2_vctk-en": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip',
+ 'md5':
+ '743e5024ca1e17a88c5c271db9779ba4',
+ 'config':
+ 'default.yaml',
+ 'ckpt':
+ 'snapshot_iter_66200.pdz',
+ 'speech_stats':
+ 'speech_stats.npy',
+ 'phones_dict':
+ 'phone_id_map.txt',
+ 'speaker_dict':
+ 'speaker_id_map.txt',
+ },
+ },
+ # tacotron2
+ "tacotron2_csmsc-zh": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip',
+ 'md5':
+ '0df4b6f0bcbe0d73c5ed6df8867ab91a',
+ 'config':
+ 'default.yaml',
+ 'ckpt':
+ 'snapshot_iter_30600.pdz',
+ 'speech_stats':
+ 'speech_stats.npy',
+ 'phones_dict':
+ 'phone_id_map.txt',
+ },
+ },
+ "tacotron2_ljspeech-en": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip',
+ 'md5':
+ '6a5eddd81ae0e81d16959b97481135f3',
+ 'config':
+ 'default.yaml',
+ 'ckpt':
+ 'snapshot_iter_60300.pdz',
+ 'speech_stats':
+ 'speech_stats.npy',
+ 'phones_dict':
+ 'phone_id_map.txt',
+ },
+ },
+ # pwgan
+ "pwgan_csmsc-zh": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip',
+ 'md5':
+ '2e481633325b5bdf0a3823c714d2c117',
+ 'config':
+ 'pwg_default.yaml',
+ 'ckpt':
+ 'pwg_snapshot_iter_400000.pdz',
+ 'speech_stats':
+ 'pwg_stats.npy',
+ },
+ },
+ "pwgan_ljspeech-en": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip',
+ 'md5':
+ '53610ba9708fd3008ccaf8e99dacbaf0',
+ 'config':
+ 'pwg_default.yaml',
+ 'ckpt':
+ 'pwg_snapshot_iter_400000.pdz',
+ 'speech_stats':
+ 'pwg_stats.npy',
+ },
+ },
+ "pwgan_aishell3-zh": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip',
+ 'md5':
+ 'd7598fa41ad362d62f85ffc0f07e3d84',
+ 'config':
+ 'default.yaml',
+ 'ckpt':
+ 'snapshot_iter_1000000.pdz',
+ 'speech_stats':
+ 'feats_stats.npy',
+ },
+ },
+ "pwgan_vctk-en": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip',
+ 'md5':
+ 'b3da1defcde3e578be71eb284cb89f2c',
+ 'config':
+ 'default.yaml',
+ 'ckpt':
+ 'snapshot_iter_1500000.pdz',
+ 'speech_stats':
+ 'feats_stats.npy',
+ },
+ },
+ # mb_melgan
+ "mb_melgan_csmsc-zh": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip',
+ 'md5':
+ 'ee5f0604e20091f0d495b6ec4618b90d',
+ 'config':
+ 'default.yaml',
+ 'ckpt':
+ 'snapshot_iter_1000000.pdz',
+ 'speech_stats':
+ 'feats_stats.npy',
+ },
+ },
+ # style_melgan
+ "style_melgan_csmsc-zh": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip',
+ 'md5':
+ '5de2d5348f396de0c966926b8c462755',
+ 'config':
+ 'default.yaml',
+ 'ckpt':
+ 'snapshot_iter_1500000.pdz',
+ 'speech_stats':
+ 'feats_stats.npy',
+ },
+ },
+ # hifigan
+ "hifigan_csmsc-zh": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip',
+ 'md5':
+ 'dd40a3d88dfcf64513fba2f0f961ada6',
+ 'config':
+ 'default.yaml',
+ 'ckpt':
+ 'snapshot_iter_2500000.pdz',
+ 'speech_stats':
+ 'feats_stats.npy',
+ },
+ },
+ "hifigan_ljspeech-en": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip',
+ 'md5':
+ '70e9131695decbca06a65fe51ed38a72',
+ 'config':
+ 'default.yaml',
+ 'ckpt':
+ 'snapshot_iter_2500000.pdz',
+ 'speech_stats':
+ 'feats_stats.npy',
+ },
+ },
+ "hifigan_aishell3-zh": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip',
+ 'md5':
+ '3bb49bc75032ed12f79c00c8cc79a09a',
+ 'config':
+ 'default.yaml',
+ 'ckpt':
+ 'snapshot_iter_2500000.pdz',
+ 'speech_stats':
+ 'feats_stats.npy',
+ },
+ },
+ "hifigan_vctk-en": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip',
+ 'md5':
+ '7da8f88359bca2457e705d924cf27bd4',
+ 'config':
+ 'default.yaml',
+ 'ckpt':
+ 'snapshot_iter_2500000.pdz',
+ 'speech_stats':
+ 'feats_stats.npy',
+ },
+ },
+ # wavernn
+ "wavernn_csmsc-zh": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip',
+ 'md5':
+ 'ee37b752f09bcba8f2af3b777ca38e13',
+ 'config':
+ 'default.yaml',
+ 'ckpt':
+ 'snapshot_iter_400000.pdz',
+ 'speech_stats':
+ 'feats_stats.npy',
+ },
+ },
+ "fastspeech2_cnndecoder_csmsc-zh": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip',
+ 'md5':
+ '6eb28e22ace73e0ebe7845f86478f89f',
+ 'config':
+ 'cnndecoder.yaml',
+ 'ckpt':
+ 'snapshot_iter_153000.pdz',
+ 'speech_stats':
+ 'speech_stats.npy',
+ 'phones_dict':
+ 'phone_id_map.txt',
+ },
+ },
+}
+
+tts_static_pretrained_models = {
+ # speedyspeech
+ "speedyspeech_csmsc-zh": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip',
+ 'md5':
+ 'f10cbdedf47dc7a9668d2264494e1823',
+ 'model':
+ 'speedyspeech_csmsc.pdmodel',
+ 'params':
+ 'speedyspeech_csmsc.pdiparams',
+ 'phones_dict':
+ 'phone_id_map.txt',
+ 'tones_dict':
+ 'tone_id_map.txt',
+ 'sample_rate':
+ 24000,
+ },
+ },
+ # fastspeech2
+ "fastspeech2_csmsc-zh": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip',
+ 'md5':
+ '9788cd9745e14c7a5d12d32670b2a5a7',
+ 'model':
+ 'fastspeech2_csmsc.pdmodel',
+ 'params':
+ 'fastspeech2_csmsc.pdiparams',
+ 'phones_dict':
+ 'phone_id_map.txt',
+ 'sample_rate':
+ 24000,
+ },
+ },
+ # pwgan
+ "pwgan_csmsc-zh": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip',
+ 'md5':
+ 'e3504aed9c5a290be12d1347836d2742',
+ 'model':
+ 'pwgan_csmsc.pdmodel',
+ 'params':
+ 'pwgan_csmsc.pdiparams',
+ 'sample_rate':
+ 24000,
+ },
+ },
+ # mb_melgan
+ "mb_melgan_csmsc-zh": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip',
+ 'md5':
+ 'ac6eee94ba483421d750433f4c3b8d36',
+ 'model':
+ 'mb_melgan_csmsc.pdmodel',
+ 'params':
+ 'mb_melgan_csmsc.pdiparams',
+ 'sample_rate':
+ 24000,
+ },
+ },
+ # hifigan
+ "hifigan_csmsc-zh": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip',
+ 'md5':
+ '7edd8c436b3a5546b3a7cb8cff9d5a0c',
+ 'model':
+ 'hifigan_csmsc.pdmodel',
+ 'params':
+ 'hifigan_csmsc.pdiparams',
+ 'sample_rate':
+ 24000,
+ },
+ },
+}
+
+tts_onnx_pretrained_models = {
+ # fastspeech2
+ "fastspeech2_csmsc_onnx-zh": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip',
+ 'md5':
+ 'fd3ad38d83273ad51f0ea4f4abf3ab4e',
+ 'ckpt': ['fastspeech2_csmsc.onnx'],
+ 'phones_dict':
+ 'phone_id_map.txt',
+ 'sample_rate':
+ 24000,
+ },
+ },
+ "fastspeech2_cnndecoder_csmsc_onnx-zh": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip',
+ 'md5':
+ '5f70e1a6bcd29d72d54e7931aa86f266',
+ 'ckpt': [
+ 'fastspeech2_csmsc_am_encoder_infer.onnx',
+ 'fastspeech2_csmsc_am_decoder.onnx',
+ 'fastspeech2_csmsc_am_postnet.onnx',
+ ],
+ 'speech_stats':
+ 'speech_stats.npy',
+ 'phones_dict':
+ 'phone_id_map.txt',
+ 'sample_rate':
+ 24000,
+ },
+ },
+ # mb_melgan
+ "mb_melgan_csmsc_onnx-zh": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip',
+ 'md5':
+ '5b83ec746e8414bc29032d954ffd07ec',
+ 'ckpt':
+ 'mb_melgan_csmsc.onnx',
+ 'sample_rate':
+ 24000,
+ },
+ },
+ # hifigan
+ "hifigan_csmsc_onnx-zh": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip',
+ 'md5':
+ '1a7dc0385875889e46952e50c0994a6b',
+ 'ckpt':
+ 'hifigan_csmsc.onnx',
+ 'sample_rate':
+ 24000,
+ },
+ },
+}
+
+# ---------------------------------
+# ------------ Vector -------------
+# ---------------------------------
+vector_dynamic_pretrained_models = {
+ "ecapatdnn_voxceleb12-16k": {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_0.tar.gz',
+ 'md5':
+ 'cc33023c54ab346cd318408f43fcaf95',
+ 'cfg_path':
+ 'conf/model.yaml', # the yaml config path
+ 'ckpt_path':
+ 'model/model', # the format is ${dir}/${model_name}:
+ # the first 'model' is the directory and the second is the file name,
+ # i.e. the checkpoint is stored as model/model.pdparams
+ },
+ },
+}
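+
+# A minimal sketch of how the relative paths above are consumed: CommonTaskResource
+# downloads and unpacks each archive into a resource directory and joins entries onto
+# it, e.g. os.path.join(res_dir, 'conf/model.yaml') for the config and
+# os.path.join(res_dir, 'model/model') for the checkpoint stored as model/model.pdparams.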
diff --git a/paddlespeech/resource/resource.py b/paddlespeech/resource/resource.py
new file mode 100644
index 0000000000000000000000000000000000000000..fff5f745dd9242f08f6ec6116cd84b167a04bddb
--- /dev/null
+++ b/paddlespeech/resource/resource.py
@@ -0,0 +1,233 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from collections import OrderedDict
+from typing import Dict
+from typing import List
+from typing import Optional
+
+from ..cli.utils import download_and_decompress
+from ..cli.utils import MODEL_HOME
+from ..utils.dynamic_import import dynamic_import
+from .model_alias import model_alias
+
+task_supported = ['asr', 'cls', 'st', 'text', 'tts', 'vector']
+model_format_supported = ['dynamic', 'static', 'onnx']
+inference_mode_supported = ['online', 'offline']
+
+
+class CommonTaskResource:
+ def __init__(self, task: str, model_format: str='dynamic', **kwargs):
+ assert task in task_supported, 'Arg "task" must be one of {}.'.format(
+ task_supported)
+ assert model_format in model_format_supported, 'Arg "model_format" must be one of {}.'.format(
+ model_format_supported)
+
+ self.task = task
+ self.model_format = model_format
+ self.pretrained_models = self._get_pretrained_models()
+
+ if 'inference_mode' in kwargs:
+ assert kwargs[
+ 'inference_mode'] in inference_mode_supported, 'Arg "inference_mode" must be one of {}.'.format(
+ inference_mode_supported)
+ self._inference_mode_filter(kwargs['inference_mode'])
+
+ # These attributes are initialized later, once the model and version have been set (see set_task_model).
+ self.model_tag = None
+ self.version = None
+ self.res_dict = None
+ self.res_dir = None
+
+ if self.task == 'tts':
+ # For vocoder
+ self.voc_model_tag = None
+ self.voc_version = None
+ self.voc_res_dict = None
+ self.voc_res_dir = None
+
+ def set_task_model(self,
+ model_tag: str,
+ model_type: int=0,
+ version: Optional[str]=None):
+ """Set model tag and version of current task.
+
+ Args:
+ model_tag (str): Model tag.
+ model_type (int): 0 selects the acoustic model; any other value selects the vocoder (tts task only).
+ version (Optional[str], optional): Version of pretrained model. Defaults to None.
+ """
+ assert model_tag in self.pretrained_models, \
+ "Can't find \"{}\" in resource. Model name must be one of {}".format(model_tag, list(self.pretrained_models.keys()))
+
+ if version is None:
+ version = self._get_default_version(model_tag)
+
+ assert version in self.pretrained_models[model_tag], \
+ "Can't find version \"{}\" in \"{}\". Model name must be one of {}".format(
+ version, model_tag, list(self.pretrained_models[model_tag].keys()))
+
+ if model_type == 0:
+ self.model_tag = model_tag
+ self.version = version
+ self.res_dict = self.pretrained_models[model_tag][version]
+ self.format_path(self.res_dict)
+ self.res_dir = self._fetch(self.res_dict,
+ self._get_model_dir(model_type))
+ else:
+ assert self.task == 'tts', 'Vocoder will only be used in tts task.'
+ self.voc_model_tag = model_tag
+ self.voc_version = version
+ self.voc_res_dict = self.pretrained_models[model_tag][version]
+ self.format_path(self.voc_res_dict)
+ self.voc_res_dir = self._fetch(self.voc_res_dict,
+ self._get_model_dir(model_type))
+
+ @staticmethod
+ def format_path(res_dict: Dict[str, str]):
+ for k, v in res_dict.items():
+ if '/' in v:
+ if v.startswith('https://') or v.startswith('http://'):
+ continue
+ else:
+ res_dict[k] = os.path.join(*(v.split('/')))
+
+ @staticmethod
+ def get_model_class(model_name: str) -> List[object]:
+ """Dynamically import the model class(es) registered for a model name.
+
+ Args:
+ model_name (str): Model name.
+
+ Returns:
+ List[object]: A list of model classes, or a single class if only one is registered.
+ """
+ assert model_name in model_alias, 'No model classes found for "{}"'.format(
+ model_name)
+
+ ret = []
+ for import_path in model_alias[model_name]:
+ ret.append(dynamic_import(import_path))
+
+ if len(ret) == 1:
+ return ret[0]
+ else:
+ return ret
+
+ def get_versions(self, model_tag: str) -> List[str]:
+ """List all available versions.
+
+ Args:
+ model_tag (str): Model tag.
+
+ Returns:
+ List[str]: Version list of model.
+ """
+ return list(self.pretrained_models[model_tag].keys())
+
+ def _get_default_version(self, model_tag: str) -> str:
+ """Get default version of model.
+
+ Args:
+ model_tag (str): Model tag.
+
+ Returns:
+ str: Default version.
+ """
+ return self.get_versions(model_tag)[-1] # get latest version
+
+ def _get_model_dir(self, model_type: int=0) -> os.PathLike:
+ """Get resource directory.
+
+ Args:
+ model_type (int): 0 selects the acoustic model; any other value selects the vocoder (tts task only).
+
+ Returns:
+ os.PathLike: Directory of model resource.
+ """
+ if model_type == 0:
+ model_tag = self.model_tag
+ version = self.version
+ else:
+ model_tag = self.voc_model_tag
+ version = self.voc_version
+
+ return os.path.join(MODEL_HOME, model_tag, version)
+
+ def _get_pretrained_models(self) -> Dict[str, str]:
+ """Get all available models for current task.
+
+ Returns:
+ Dict[str, str]: A dictionary with model tag and resources info.
+ """
+ try:
+ import_models = '{}_{}_pretrained_models'.format(self.task,
+ self.model_format)
+ exec('from .pretrained_models import {}'.format(import_models))
+ models = OrderedDict(locals()[import_models])
+ except ImportError:
+ models = OrderedDict({}) # no models.
+ finally:
+ return models
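+ # For example, task='asr' with model_format='static' imports
+ # 'asr_static_pretrained_models' from .pretrained_models; a combination with
+ # no matching dict simply yields an empty OrderedDict.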
+
+ def _inference_mode_filter(self, inference_mode: Optional[str]):
+ """Filter models dict based on inference_mode.
+
+ Args:
+ inference_mode (Optional[str]): 'online', 'offline' or None.
+ """
+ if inference_mode is None:
+ return
+
+ if self.task == 'asr':
+ online_flags = [
+ 'online' in model_tag
+ for model_tag in self.pretrained_models.keys()
+ ]
+ for online_flag, model_tag in zip(
+ online_flags, list(self.pretrained_models.keys())):
+ if inference_mode == 'online' and online_flag:
+ continue
+ elif inference_mode == 'offline' and not online_flag:
+ continue
+ else:
+ del self.pretrained_models[model_tag]
+ elif self.task == 'tts':
+ # Hard-coded list of TTS models that support online (streaming) inference.
+ tts_online_models = [
+ 'fastspeech2_csmsc-zh', 'fastspeech2_cnndecoder_csmsc-zh',
+ 'mb_melgan_csmsc-zh', 'hifigan_csmsc-zh'
+ ]
+ for model_tag in list(self.pretrained_models.keys()):
+ if inference_mode == 'online' and model_tag in tts_online_models:
+ continue
+ elif inference_mode == 'offline':
+ continue
+ else:
+ del self.pretrained_models[model_tag]
+ else:
+ raise NotImplementedError('Only the asr and tts tasks are supported.')
+
+ @staticmethod
+ def _fetch(res_dict: Dict[str, str],
+ target_dir: os.PathLike) -> os.PathLike:
+ """Fetch archive from url.
+
+ Args:
+ res_dict (Dict[str, str]): Info dict of a resource.
+ target_dir (os.PathLike): Directory to save archives.
+
+ Returns:
+ os.PathLike: Directory of model resource.
+ """
+ return download_and_decompress(res_dict, target_dir)
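+
+
+# A minimal usage sketch (tag names follow the dicts in .pretrained_models):
+#
+# resource = CommonTaskResource(task='asr', model_format='dynamic')
+# resource.set_task_model('conformer_wenetspeech-zh-16k') # fetched into MODEL_HOME
+# cfg_path = os.path.join(resource.res_dir, resource.res_dict['cfg_path'])
+# ckpt_path = os.path.join(resource.res_dir, resource.res_dict['ckpt_path'])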
diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py
index 2365071f31fbb51b0c8dec6950a8eb7521d92693..2da68435c27fac821a7daed05c03a78b8914eed3 100644
--- a/paddlespeech/s2t/__init__.py
+++ b/paddlespeech/s2t/__init__.py
@@ -189,25 +189,6 @@ if not hasattr(paddle.Tensor, 'contiguous'):
paddle.static.Variable.contiguous = contiguous
-def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
- nargs = len(args)
- assert (nargs <= 1)
- s = paddle.shape(xs)
- if nargs == 1:
- return s[args[0]]
- else:
- return s
-
-
-#`to_static` do not process `size` property, maybe some `paddle` api dependent on it.
-logger.debug(
- "override size of paddle.Tensor "
- "(`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!"
-)
-paddle.Tensor.size = size
-paddle.static.Variable.size = size
-
-
def view(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
return xs.reshape(args)
@@ -219,7 +200,7 @@ if not hasattr(paddle.Tensor, 'view'):
def view_as(xs: paddle.Tensor, ys: paddle.Tensor) -> paddle.Tensor:
- return xs.reshape(ys.size())
+ return xs.reshape(paddle.shape(ys))
if not hasattr(paddle.Tensor, 'view_as'):
diff --git a/paddlespeech/s2t/decoders/beam_search/beam_search.py b/paddlespeech/s2t/decoders/beam_search/beam_search.py
index f331cb1c93e1331aa25600e6b5b819212ed6f096..f6a2b4b0a422ea7ee5a31745bf603866db0528f4 100644
--- a/paddlespeech/s2t/decoders/beam_search/beam_search.py
+++ b/paddlespeech/s2t/decoders/beam_search/beam_search.py
@@ -194,7 +194,7 @@ class BeamSearch(paddle.nn.Layer):
Args:
hyp (Hypothesis): Hypothesis with prefix tokens to score
- ids (paddle.Tensor): 1D tensor of new partial tokens to score,
+ ids (paddle.Tensor): 1D tensor of new partial tokens to score,
len(ids) < n_vocab
x (paddle.Tensor): Corresponding input feature, (T, D)
@@ -224,14 +224,14 @@ class BeamSearch(paddle.nn.Layer):
ids (paddle.Tensor): The partial token ids(Global) to compute topk.
Returns:
- Tuple[paddle.Tensor, paddle.Tensor]:
+ Tuple[paddle.Tensor, paddle.Tensor]:
The topk full token ids and partial token ids.
Their shapes are `(self.beam_size,)`.
i.e. (global ids, global relative local ids).
"""
# no pre beam performed, `ids` equal to `weighted_scores`
- if weighted_scores.size(0) == ids.size(0):
+ if paddle.shape(weighted_scores)[0] == paddle.shape(ids)[0]:
top_ids = weighted_scores.topk(
self.beam_size)[1] # index in n_vocab
return top_ids, top_ids
@@ -370,13 +370,13 @@ class BeamSearch(paddle.nn.Layer):
"""
# set length bounds
if maxlenratio == 0:
- maxlen = x.shape[0]
+ maxlen = paddle.shape(x)[0]
elif maxlenratio < 0:
maxlen = -1 * int(maxlenratio)
else:
- maxlen = max(1, int(maxlenratio * x.size(0)))
- minlen = int(minlenratio * x.size(0))
- logger.info("decoder input length: " + str(x.shape[0]))
+ maxlen = max(1, int(maxlenratio * paddle.shape(x)[0]))
+ minlen = int(minlenratio * paddle.shape(x)[0])
+ logger.info("decoder input length: " + str(paddle.shape(x)[0]))
logger.info("max output length: " + str(maxlen))
logger.info("min output length: " + str(minlen))
diff --git a/paddlespeech/s2t/decoders/scorers/ctc.py b/paddlespeech/s2t/decoders/scorers/ctc.py
index 81d8b078392eb0282d59cfbefbb72a2583647aae..3c1d4cf8076d2af5399e636aa4ab7f894aeaae5c 100644
--- a/paddlespeech/s2t/decoders/scorers/ctc.py
+++ b/paddlespeech/s2t/decoders/scorers/ctc.py
@@ -69,7 +69,7 @@ class CTCPrefixScorer(BatchPartialScorerInterface):
return sc[i], st[i]
else: # for CTCPrefixScorePD (need new_id > 0)
r, log_psi, f_min, f_max, scoring_idmap = state
- s = log_psi[i, new_id].expand(log_psi.size(1))
+ s = log_psi[i, new_id].expand(paddle.shape(log_psi)[1])
if scoring_idmap is not None:
return r[:, :, i, scoring_idmap[i, new_id]], s, f_min, f_max
else:
@@ -107,7 +107,7 @@ class CTCPrefixScorer(BatchPartialScorerInterface):
"""
logp = self.ctc.log_softmax(x.unsqueeze(0)) # assuming batch_size = 1
- xlen = paddle.to_tensor([logp.size(1)])
+ xlen = paddle.to_tensor([paddle.shape(logp)[1]])
self.impl = CTCPrefixScorePD(logp, xlen, 0, self.eos)
return None
diff --git a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py
index 78b8fe36c8c0383d642740cab252ba7c89ba2ec0..d8ca5ccde6842274fd2eea6f3a34c36f404ae717 100644
--- a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py
+++ b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py
@@ -33,9 +33,9 @@ class CTCPrefixScorePD():
self.logzero = -10000000000.0
self.blank = blank
self.eos = eos
- self.batch = x.size(0)
- self.input_length = x.size(1)
- self.odim = x.size(2)
+ self.batch = paddle.shape(x)[0]
+ self.input_length = paddle.shape(x)[1]
+ self.odim = paddle.shape(x)[2]
self.dtype = x.dtype
# Pad the rest of posteriors in the batch
@@ -76,8 +76,7 @@ class CTCPrefixScorePD():
last_ids = [yi[-1] for yi in y] # last output label ids
n_bh = len(last_ids) # batch * hyps
n_hyps = n_bh // self.batch # assuming each utterance has the same # of hyps
- self.scoring_num = scoring_ids.size(
- -1) if scoring_ids is not None else 0
+ self.scoring_num = paddle.shape(scoring_ids)[-1] if scoring_ids is not None else 0
# prepare state info
if state is None:
r_prev = paddle.full(
@@ -153,7 +152,7 @@ class CTCPrefixScorePD():
# compute forward probabilities log(r_t^n(h)) and log(r_t^b(h))
for t in range(start, end):
- rp = r[t - 1] # (2 x BW x O')
+ rp = r[t - 1] # (2 x BW x O')
rr = paddle.stack([rp[0], log_phi[t - 1], rp[0], rp[1]]).view(
2, 2, n_bh, snum) # (2,2,BW,O')
r[t] = paddle.logsumexp(rr, 1) + x_[:, t]
@@ -227,7 +226,7 @@ class CTCPrefixScorePD():
if self.x.shape[1] < x.shape[1]: # self.x (2,T,B,O); x (B,T,O)
# Pad the rest of posteriors in the batch
# TODO(takaaki-hori): need a better way without for-loops
- xlens = [x.size(1)]
+ xlens = [paddle.shape(x)[1]]
for i, l in enumerate(xlens):
if l < self.input_length:
x[i, l:, :] = self.logzero
@@ -237,7 +236,7 @@ class CTCPrefixScorePD():
xb = xn[:, :, self.blank].unsqueeze(2).expand(-1, -1, self.odim)
self.x = paddle.stack([xn, xb]) # (2, T, B, O)
self.x[:, :tmp_x.shape[1], :, :] = tmp_x
- self.input_length = x.size(1)
+ self.input_length = paddle.shape(x)[1]
self.end_frames = paddle.to_tensor(xlens) - 1
def extend_state(self, state):
@@ -318,16 +317,16 @@ class CTCPrefixScore():
r[0, 0] = xs[0]
r[0, 1] = self.logzero
else:
- # Although the code does not exactly follow Algorithm 2,
- # we don't have to change it because we can assume
- # r_t(h)=0 for t < |h| in CTC forward computation
+ # Although the code does not exactly follow Algorithm 2,
+ # we don't have to change it because we can assume
+ # r_t(h)=0 for t < |h| in CTC forward computation
# (Note: we assume here that index t starts with 0).
# The purpose of this difference is to reduce the number of for-loops.
# https://github.com/espnet/espnet/pull/3655
- # where we start to accumulate r_t(h) from t=|h|
- # and iterate r_t(h) = (r_{t-1}(h) + ...) to T-1,
+ # where we start to accumulate r_t(h) from t=|h|
+ # and iterate r_t(h) = (r_{t-1}(h) + ...) to T-1,
# avoiding accumulating zeros for t=1~|h|-1.
- # Thus, we need to set r_{|h|-1}(h) = 0,
+ # Thus, we need to set r_{|h|-1}(h) = 0,
# i.e., r[output_length-1] = logzero, for initialization.
# This is just for reducing the computation.
r[output_length - 1] = self.logzero
diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py
index 0e94f047bce7ad053ecd566f4a8d8c83a1b10a7c..9987b511051a302ab76ca52440c0260a6981af5a 100644
--- a/paddlespeech/s2t/io/dataset.py
+++ b/paddlespeech/s2t/io/dataset.py
@@ -1,4 +1,5 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2021 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/paddlespeech/s2t/models/ds2/__init__.py b/paddlespeech/s2t/models/ds2/__init__.py
index b32220673e610ea2ba1e907011c92708c2797fb3..0a5c50d860f0f1432f0c1889edba8673b813bfb7 100644
--- a/paddlespeech/s2t/models/ds2/__init__.py
+++ b/paddlespeech/s2t/models/ds2/__init__.py
@@ -14,13 +14,15 @@
from .deepspeech2 import DeepSpeech2InferModel
from .deepspeech2 import DeepSpeech2Model
from paddlespeech.s2t.utils import dynamic_pip_install
+import sys
try:
import paddlespeech_ctcdecoders
except ImportError:
try:
package_name = 'paddlespeech_ctcdecoders'
- dynamic_pip_install.install(package_name)
+ if sys.platform != "win32":
+ dynamic_pip_install.install(package_name)
except Exception:
raise RuntimeError(
"Can not install package paddlespeech_ctcdecoders on your system. \
diff --git a/paddlespeech/s2t/models/ds2_online/__init__.py b/paddlespeech/s2t/models/ds2_online/__init__.py
index c5fdab1bc66aff815ed217d99703c1ff3493975a..de772b6457be185f1995c1ca7498dd50c78ae766 100644
--- a/paddlespeech/s2t/models/ds2_online/__init__.py
+++ b/paddlespeech/s2t/models/ds2_online/__init__.py
@@ -14,13 +14,15 @@
from .deepspeech2 import DeepSpeech2InferModelOnline
from .deepspeech2 import DeepSpeech2ModelOnline
from paddlespeech.s2t.utils import dynamic_pip_install
+import sys
try:
import paddlespeech_ctcdecoders
except ImportError:
try:
package_name = 'paddlespeech_ctcdecoders'
- dynamic_pip_install.install(package_name)
+ if sys.platform != "win32":
+ dynamic_pip_install.install(package_name)
except Exception:
raise RuntimeError(
"Can not install package paddlespeech_ctcdecoders on your system. \
diff --git a/paddlespeech/s2t/models/lm/transformer.py b/paddlespeech/s2t/models/lm/transformer.py
index 85bd7c2329fbf416d254bd9eabcaaf181fe7db01..d14f99563fcfc7ff8a73e131119fea96ea1d7d07 100644
--- a/paddlespeech/s2t/models/lm/transformer.py
+++ b/paddlespeech/s2t/models/lm/transformer.py
@@ -90,7 +90,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
def _target_mask(self, ys_in_pad):
ys_mask = ys_in_pad != 0
- m = subsequent_mask(ys_mask.size(-1)).unsqueeze(0)
+ m = subsequent_mask(paddle.shape(ys_mask)[-1]).unsqueeze(0)
return ys_mask.unsqueeze(-2) & m
def forward(self, x: paddle.Tensor, t: paddle.Tensor
@@ -112,7 +112,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
in perplexity: p(t)^{-n} = exp(-log p(t) / n)
"""
- batch_size = x.size(0)
+ batch_size = paddle.shape(x)[0]
xm = x != 0
xlen = xm.sum(axis=1)
if self.embed_drop is not None:
@@ -122,7 +122,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
h, _ = self.encoder(emb, xlen)
y = self.decoder(h)
loss = F.cross_entropy(
- y.view(-1, y.shape[-1]), t.view(-1), reduction="none")
+ y.view(-1, paddle.shape(y)[-1]), t.view(-1), reduction="none")
mask = xm.to(loss.dtype)
logp = loss * mask.view(-1)
nll = logp.view(batch_size, -1).sum(-1)
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index 530840d0f161dcd03ac6873f5217be9a90597150..b4b61666f24f0fe67ea85d92565916617d5d20b2 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -1,3 +1,4 @@
+# Copyright 2021 Mobvoi Inc. All Rights Reserved.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -775,7 +776,7 @@ class U2DecodeModel(U2BaseModel):
"""
self.eval()
x = paddle.to_tensor(x).unsqueeze(0)
- ilen = x.size(1)
+ ilen = paddle.shape(x)[1]
enc_output, _ = self._forward_encoder(x, ilen)
return enc_output.squeeze(0)
diff --git a/paddlespeech/s2t/models/u2/updater.py b/paddlespeech/s2t/models/u2/updater.py
index c59090a84ee4d416353eff3d6049ff3451cf0dae..898a50bf0a6c241f6b7cdcdbab78310731ead00a 100644
--- a/paddlespeech/s2t/models/u2/updater.py
+++ b/paddlespeech/s2t/models/u2/updater.py
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-# Modified from wenet(https://github.com/wenet-e2e/wenet)
+
from contextlib import nullcontext
import paddle
diff --git a/paddlespeech/s2t/modules/ctc.py b/paddlespeech/s2t/modules/ctc.py
index 33ad472defba0a86bc945582f386acb406e4c35e..ca576eef1d2bfb44d52b6e2de54bd720983edf0b 100644
--- a/paddlespeech/s2t/modules/ctc.py
+++ b/paddlespeech/s2t/modules/ctc.py
@@ -22,6 +22,7 @@ from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.modules.loss import CTCLoss
from paddlespeech.s2t.utils import ctc_utils
from paddlespeech.s2t.utils.log import Log
+import sys
logger = Log(__name__).getlog()
@@ -34,7 +35,8 @@ except ImportError:
try:
from paddlespeech.s2t.utils import dynamic_pip_install
package_name = 'paddlespeech_ctcdecoders'
- dynamic_pip_install.install(package_name)
+ if sys.platform != "win32":
+ dynamic_pip_install.install(package_name)
from paddlespeech.s2t.decoders.ctcdecoder import ctc_beam_search_decoding_batch # noqa: F401
from paddlespeech.s2t.decoders.ctcdecoder import ctc_greedy_decoding # noqa: F401
from paddlespeech.s2t.decoders.ctcdecoder import Scorer # noqa: F401
diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py
index 42ac119b44540a1931408b1b86aa75e8b1413597..ccc8482d5372cd77bcae29bdc5b876e8620e2c77 100644
--- a/paddlespeech/s2t/modules/decoder.py
+++ b/paddlespeech/s2t/modules/decoder.py
@@ -242,7 +242,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
]
# batch decoding
- ys_mask = subsequent_mask(ys.size(-1)).unsqueeze(0) # (B,L,L)
+ ys_mask = subsequent_mask(paddle.shape(ys)[-1]).unsqueeze(0) # (B,L,L)
xs_mask = make_xs_mask(xs).unsqueeze(1) # (B,1,T)
logp, states = self.forward_one_step(
xs, xs_mask, ys, ys_mask, cache=batch_state)
diff --git a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py
index 596f61b78a4e449b2998b3544dd4204371aa8a2b..51e558eb8adbd801e0c1a0d563062e5a3a3f2104 100644
--- a/paddlespeech/s2t/modules/embedding.py
+++ b/paddlespeech/s2t/modules/embedding.py
@@ -115,7 +115,7 @@ class PositionalEncoding(nn.Layer, PositionalEncodingInterface):
assert offset + x.shape[
1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format(
offset, x.shape[1], self.max_len)
- #TODO(Hui Zhang): using T = x.size(1), __getitem__ not support Tensor
+ #TODO(Hui Zhang): using T = paddle.shape(x)[1], __getitem__ not support Tensor
pos_emb = self.pe[:, offset:offset + T]
x = x * self.xscale + pos_emb
return self.dropout(x), self.dropout(pos_emb)
@@ -165,6 +165,6 @@ class RelPositionalEncoding(PositionalEncoding):
1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format(
offset, x.shape[1], self.max_len)
x = x * self.xscale
- #TODO(Hui Zhang): using x.size(1), __getitem__ not support Tensor
+ #TODO(Hui Zhang): using paddle.shape(x)[1], __getitem__ not support Tensor
pos_emb = self.pe[:, offset:offset + x.shape[1]]
return self.dropout(x), self.dropout(pos_emb)
diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py
index 669a12d656947f0446eba3d228832964e8c1d7b0..4d31acf1a73201cfb8f1e49b020c6635792ae638 100644
--- a/paddlespeech/s2t/modules/encoder.py
+++ b/paddlespeech/s2t/modules/encoder.py
@@ -218,7 +218,7 @@ class BaseEncoder(nn.Layer):
assert xs.shape[0] == 1 # batch size must be one
# tmp_masks is just for interface compatibility
# TODO(Hui Zhang): stride_slice not support bool tensor
- # tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
+ # tmp_masks = paddle.ones([1, paddle.shape(xs)[1]], dtype=paddle.bool)
tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.int32)
tmp_masks = tmp_masks.unsqueeze(1) #[B=1, C=1, T]
diff --git a/paddlespeech/s2t/transform/perturb.py b/paddlespeech/s2t/transform/perturb.py
index 9e41b824b6ed0261db5acb24fb5e0aff2a4758fa..b18caefb8b978d426a1b105cb1480067176cc6ff 100644
--- a/paddlespeech/s2t/transform/perturb.py
+++ b/paddlespeech/s2t/transform/perturb.py
@@ -154,7 +154,8 @@ class SpeedPerturbationSox():
package = "sox"
dynamic_pip_install.install(package)
package = "soxbindings"
- dynamic_pip_install.install(package)
+ if sys.platform != "win32":
+ dynamic_pip_install.install(package)
import soxbindings as sox
except Exception:
raise RuntimeError(
diff --git a/paddlespeech/s2t/utils/ctc_utils.py b/paddlespeech/s2t/utils/ctc_utils.py
index 886b72033605e9080ebc7ae06e0a32054325be71..42564d8e137705dbe6a6c1992bc4e6452ebe6ede 100644
--- a/paddlespeech/s2t/utils/ctc_utils.py
+++ b/paddlespeech/s2t/utils/ctc_utils.py
@@ -1,3 +1,4 @@
+# Copyright 2021 Mobvoi Inc. All Rights Reserved.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/paddlespeech/s2t/utils/tensor_utils.py b/paddlespeech/s2t/utils/tensor_utils.py
index 0dbaa0b6b77031d4b8e8aa29fcc9246458b8ab99..bc557b13083f5b4bf80b365d3ce3c2d6848361c0 100644
--- a/paddlespeech/s2t/utils/tensor_utils.py
+++ b/paddlespeech/s2t/utils/tensor_utils.py
@@ -58,7 +58,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
>>> a = paddle.ones(25, 300)
>>> b = paddle.ones(22, 300)
>>> c = paddle.ones(15, 300)
- >>> pad_sequence([a, b, c]).size()
+ >>> pad_sequence([a, b, c]).shape
paddle.Tensor([25, 3, 300])
Note:
@@ -79,10 +79,10 @@ def pad_sequence(sequences: List[paddle.Tensor],
# assuming trailing dimensions and type of all the Tensors
# in sequences are same and fetching those from sequences[0]
- max_size = sequences[0].size()
+ max_size = paddle.shape(sequences[0])
# (TODO Hui Zhang): slice not supprot `end==start`
# trailing_dims = max_size[1:]
- trailing_dims = max_size[1:] if max_size.ndim >= 2 else ()
+ trailing_dims = tuple(max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else ()
max_len = max([s.shape[0] for s in sequences])
if batch_first:
out_dims = (len(sequences), max_len) + trailing_dims
@@ -99,7 +99,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
if batch_first:
# TODO (Hui Zhang): set_value op not supprot `end==start`
# TODO (Hui Zhang): set_value op not support int16
- # TODO (Hui Zhang): set_varbase 2 rank not support [0,0,...]
+ # TODO (Hui Zhang): set_varbase 2 rank not support [0,0,...]
# out_tensor[i, :length, ...] = tensor
if length != 0:
out_tensor[i, :length] = tensor
@@ -145,7 +145,7 @@ def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int,
[ 4, 5, 6, 11, -1, -1],
[ 7, 8, 9, 11, -1, -1]])
"""
- # TODO(Hui Zhang): using comment code,
+ # TODO(Hui Zhang): using comment code,
#_sos = paddle.to_tensor(
# [sos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place)
#_eos = paddle.to_tensor(
diff --git a/paddlespeech/s2t/utils/text_grid.py b/paddlespeech/s2t/utils/text_grid.py
index cbd9856e40d72897cd08d3618178e60f7a34ea0f..e696f43d5fbb99fd39ff91af388d86a568e76abe 100644
--- a/paddlespeech/s2t/utils/text_grid.py
+++ b/paddlespeech/s2t/utils/text_grid.py
@@ -1,3 +1,4 @@
+# Copyright 2021 Mobvoi Inc. All Rights Reserved.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py
index a3a29fef962bf09b66df4e3c562dcab1790b5a79..11f50655f73a85df76a05abd080ee2ce41985ce7 100644
--- a/paddlespeech/server/bin/paddlespeech_server.py
+++ b/paddlespeech/server/bin/paddlespeech_server.py
@@ -25,6 +25,7 @@ from ..executor import BaseExecutor
from ..util import cli_server_register
from ..util import stats_wrapper
from paddlespeech.cli.log import logger
+from paddlespeech.resource import CommonTaskResource
from paddlespeech.server.engine.engine_pool import init_engine_pool
from paddlespeech.server.engine.engine_warmup import warm_up
from paddlespeech.server.restful.api import setup_router as setup_http_router
@@ -158,101 +159,30 @@ class ServerStatsExecutor():
"Please input correct speech task, choices = ['asr', 'tts']")
return False
- elif self.task.lower() == 'asr':
- try:
- from paddlespeech.cli.asr.infer import pretrained_models
- logger.info(
- "Here is the table of ASR pretrained models supported in the service."
- )
- self.show_support_models(pretrained_models)
-
- # show ASR static pretrained model
- from paddlespeech.server.engine.asr.paddleinference.asr_engine import pretrained_models
- logger.info(
- "Here is the table of ASR static pretrained models supported in the service."
- )
- self.show_support_models(pretrained_models)
-
- return True
- except BaseException:
- logger.error(
- "Failed to get the table of ASR pretrained models supported in the service."
- )
- return False
-
- elif self.task.lower() == 'tts':
- try:
- from paddlespeech.cli.tts.infer import pretrained_models
- logger.info(
- "Here is the table of TTS pretrained models supported in the service."
- )
- self.show_support_models(pretrained_models)
-
- # show TTS static pretrained model
- from paddlespeech.server.engine.tts.paddleinference.tts_engine import pretrained_models
- logger.info(
- "Here is the table of TTS static pretrained models supported in the service."
- )
- self.show_support_models(pretrained_models)
-
- return True
- except BaseException:
- logger.error(
- "Failed to get the table of TTS pretrained models supported in the service."
- )
- return False
+ try:
+ # Dynamic models
+ dynamic_pretrained_models = CommonTaskResource(
+ task=self.task, model_format='dynamic').pretrained_models
- elif self.task.lower() == 'cls':
- try:
- from paddlespeech.cli.cls.infer import pretrained_models
+ if len(dynamic_pretrained_models) > 0:
logger.info(
- "Here is the table of CLS pretrained models supported in the service."
- )
- self.show_support_models(pretrained_models)
-
- # show CLS static pretrained model
- from paddlespeech.server.engine.cls.paddleinference.cls_engine import pretrained_models
+ "Here is the table of {} pretrained models supported in the service.".
+ format(self.task.upper()))
+ self.show_support_models(dynamic_pretrained_models)
+
+ # Static models
+ static_pretrained_models = CommonTaskResource(
+ task=self.task, model_format='static').pretrained_models
+ if len(static_pretrained_models) > 0:
logger.info(
- "Here is the table of CLS static pretrained models supported in the service."
- )
+ "Here is the table of {} static pretrained models supported in the service.".
+ format(self.task.upper()))
- self.show_support_models(pretrained_models)
+ self.show_support_models(static_pretrained_models)
- return True
- except BaseException:
- logger.error(
- "Failed to get the table of CLS pretrained models supported in the service."
- )
- return False
- elif self.task.lower() == 'text':
- try:
- from paddlespeech.cli.text.infer import pretrained_models
- logger.info(
- "Here is the table of Text pretrained models supported in the service."
- )
- self.show_support_models(pretrained_models)
+ return True
- return True
- except BaseException:
- logger.error(
- "Failed to get the table of Text pretrained models supported in the service."
- )
- return False
- elif self.task.lower() == 'vector':
- try:
- from paddlespeech.cli.vector.infer import pretrained_models
- logger.info(
- "Here is the table of Vector pretrained models supported in the service."
- )
- self.show_support_models(pretrained_models)
-
- return True
- except BaseException:
- logger.error(
- "Failed to get the table of Vector pretrained models supported in the service."
- )
- return False
- else:
+ except BaseException:
logger.error(
- f"Failed to get the table of {self.task} pretrained models supported in the service."
- )
+ "Failed to get the table of {} pretrained models supported in the service.".
+ format(self.task.upper()))
return False
diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py
index 70bfcfb66223141158872cc468cb4111f3e0f887..14715bf35ca5ad463ad75c51d1fa6bb6e4aec041 100644
--- a/paddlespeech/server/engine/asr/online/asr_engine.py
+++ b/paddlespeech/server/engine/asr/online/asr_engine.py
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import copy
import os
import sys
from typing import Optional
@@ -21,15 +20,14 @@ import paddle
from numpy import float32
from yacs.config import CfgNode
-from .pretrained_models import pretrained_models
from paddlespeech.cli.asr.infer import ASRExecutor
from paddlespeech.cli.log import logger
from paddlespeech.cli.utils import MODEL_HOME
+from paddlespeech.resource import CommonTaskResource
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.speech import SpeechSegment
from paddlespeech.s2t.modules.ctc import CTCDecoder
from paddlespeech.s2t.transform.transformation import Transformation
-from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.s2t.utils.tensor_utils import add_sos_eos
from paddlespeech.s2t.utils.tensor_utils import pad_sequence
from paddlespeech.s2t.utils.utility import UpdateConfig
@@ -53,7 +51,7 @@ class PaddleASRConnectionHanddler:
logger.info(
"create an paddle asr connection handler to process the websocket connection"
)
- self.config = asr_engine.config
+ self.config = asr_engine.config # server config
self.model_config = asr_engine.executor.config
self.asr_engine = asr_engine
@@ -249,10 +247,15 @@ class PaddleASRConnectionHanddler:
def reset(self):
if "deepspeech2" in self.model_type:
# for deepspeech2
- self.chunk_state_h_box = copy.deepcopy(
- self.asr_engine.executor.chunk_state_h_box)
- self.chunk_state_c_box = copy.deepcopy(
- self.asr_engine.executor.chunk_state_c_box)
+ # init state
+ self.chunk_state_h_box = np.zeros(
+ (self.model_config.num_rnn_layers, 1,
+ self.model_config.rnn_layer_size),
+ dtype=float32)
+ self.chunk_state_c_box = np.zeros(
+ (self.model_config.num_rnn_layers, 1,
+ self.model_config.rnn_layer_size),
+ dtype=float32)
self.decoder.reset_decoder(batch_size=1)
self.device = None
@@ -696,7 +699,8 @@ class PaddleASRConnectionHanddler:
class ASRServerExecutor(ASRExecutor):
def __init__(self):
super().__init__()
- self.pretrained_models = pretrained_models
+ self.task_resource = CommonTaskResource(
+ task='asr', model_format='dynamic', inference_mode='online')
def _init_from_path(self,
model_type: str=None,
@@ -720,20 +724,19 @@ class ASRServerExecutor(ASRExecutor):
self.sample_rate = sample_rate
sample_rate_str = '16k' if sample_rate == 16000 else '8k'
tag = model_type + '-' + lang + '-' + sample_rate_str
-
+ self.task_resource.set_task_model(model_tag=tag)
if cfg_path is None or am_model is None or am_params is None:
logger.info(f"Load the pretrained model, tag = {tag}")
- res_path = self._get_pretrained_path(tag) # wenetspeech_zh
- self.res_path = res_path
+ self.res_path = self.task_resource.res_dir
self.cfg_path = os.path.join(
- res_path, self.pretrained_models[tag]['cfg_path'])
+ self.res_path, self.task_resource.res_dict['cfg_path'])
- self.am_model = os.path.join(res_path,
- self.pretrained_models[tag]['model'])
- self.am_params = os.path.join(res_path,
- self.pretrained_models[tag]['params'])
- logger.info(res_path)
+ self.am_model = os.path.join(self.res_path,
+ self.task_resource.res_dict['model'])
+ self.am_params = os.path.join(self.res_path,
+ self.task_resource.res_dict['params'])
+ logger.info(self.res_path)
else:
self.cfg_path = os.path.abspath(cfg_path)
self.am_model = os.path.abspath(am_model)
@@ -760,8 +763,8 @@ class ASRServerExecutor(ASRExecutor):
self.text_feature = TextFeaturizer(
unit_type=self.config.unit_type, vocab=self.vocab)
- lm_url = self.pretrained_models[tag]['lm_url']
- lm_md5 = self.pretrained_models[tag]['lm_md5']
+ lm_url = self.task_resource.res_dict['lm_url']
+ lm_md5 = self.task_resource.res_dict['lm_md5']
logger.info(f"Start to load language model {lm_url}")
self.download_lm(
lm_url,
@@ -803,41 +806,11 @@ class ASRServerExecutor(ASRExecutor):
model_file=self.am_model,
params_file=self.am_params,
predictor_conf=self.am_predictor_conf)
-
- # decoder
- logger.info("ASR engine start to create the ctc decoder instance")
- self.decoder = CTCDecoder(
- odim=self.config.output_dim, # is in vocab
- enc_n_units=self.config.rnn_layer_size * 2,
- blank_id=self.config.blank_id,
- dropout_rate=0.0,
- reduction=True, # sum
- batch_average=True, # sum / batch_size
- grad_norm_type=self.config.get('ctc_grad_norm_type', None))
-
- # init decoder
- logger.info("ASR engine start to init the ctc decoder")
- cfg = self.config.decode
- decode_batch_size = 1 # for online
- self.decoder.init_decoder(
- decode_batch_size, self.text_feature.vocab_list,
- cfg.decoding_method, cfg.lang_model_path, cfg.alpha, cfg.beta,
- cfg.beam_size, cfg.cutoff_prob, cfg.cutoff_top_n,
- cfg.num_proc_bsearch)
-
- # init state box
- self.chunk_state_h_box = np.zeros(
- (self.config.num_rnn_layers, 1, self.config.rnn_layer_size),
- dtype=float32)
- self.chunk_state_c_box = np.zeros(
- (self.config.num_rnn_layers, 1, self.config.rnn_layer_size),
- dtype=float32)
-
elif "conformer" in model_type or "transformer" in model_type:
model_name = model_type[:model_type.rindex(
'_')] # model_type: {model_name}_{dataset}
logger.info(f"model name: {model_name}")
- model_class = dynamic_import(model_name, self.model_alias)
+ model_class = self.task_resource.get_model_class(model_name)
model_conf = self.config
model = model_class.from_config(model_conf)
self.model = model
@@ -847,10 +820,6 @@ class ASRServerExecutor(ASRExecutor):
model_dict = paddle.load(self.am_model)
self.model.set_state_dict(model_dict)
logger.info("create the transformer like model success")
-
- # update the ctc decoding
- self.searcher = CTCPrefixBeamSearch(self.config.decode)
- self.transformer_decode_reset()
else:
raise ValueError(f"Not support: {model_type}")
@@ -881,8 +850,8 @@ class ASREngine(BaseEngine):
self.executor = ASRServerExecutor()
try:
- default_dev = paddle.get_device()
- paddle.set_device(self.config.get("device", default_dev))
+ self.device = self.config.get("device", paddle.get_device())
+ paddle.set_device(self.device)
except BaseException as e:
logger.error(
f"Set device failed, please check if device '{self.device}' is already used and the parameter 'device' in the yaml file"
diff --git a/paddlespeech/server/engine/asr/online/pretrained_models.py b/paddlespeech/server/engine/asr/online/pretrained_models.py
deleted file mode 100644
index ff3778657e85efe1808b1cdb8e34d33ebad862d3..0000000000000000000000000000000000000000
--- a/paddlespeech/server/engine/asr/online/pretrained_models.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-pretrained_models = {
- "deepspeech2online_aishell-zh-16k": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz',
- 'md5':
- '98b87b171b7240b7cae6e07d8d0bc9be',
- 'cfg_path':
- 'model.yaml',
- 'ckpt_path':
- 'exp/deepspeech2_online/checkpoints/avg_1',
- 'model':
- 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel',
- 'params':
- 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams',
- 'lm_url':
- 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
- 'lm_md5':
- '29e02312deb2e59b3c8686c7966d4fe3'
- },
- "conformer_online_multicn-zh-16k": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz',
- 'md5':
- '0ac93d390552336f2a906aec9e33c5fa',
- 'cfg_path':
- 'model.yaml',
- 'ckpt_path':
- 'exp/chunk_conformer/checkpoints/multi_cn',
- 'model':
- 'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
- 'params':
- 'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
- 'lm_url':
- 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
- 'lm_md5':
- '29e02312deb2e59b3c8686c7966d4fe3'
- },
- "conformer_online_wenetspeech-zh-16k": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz',
- 'md5':
- 'b8c02632b04da34aca88459835be54a6',
- 'cfg_path':
- 'model.yaml',
- 'ckpt_path':
- 'exp/chunk_conformer/checkpoints/avg_10',
- 'model':
- 'exp/chunk_conformer/checkpoints/avg_10.pdparams',
- 'params':
- 'exp/chunk_conformer/checkpoints/avg_10.pdparams',
- 'lm_url':
- '',
- 'lm_md5':
- '',
- },
-}
diff --git a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py
index 4234e1e2d41b40c5d896ebf86e7781f34e24c95c..35f0aa64a2b363d4ecce57a38dee7c7d82c3bcd6 100644
--- a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py
+++ b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py
@@ -19,10 +19,10 @@ from typing import Optional
import paddle
from yacs.config import CfgNode
-from .pretrained_models import pretrained_models
from paddlespeech.cli.asr.infer import ASRExecutor
from paddlespeech.cli.log import logger
from paddlespeech.cli.utils import MODEL_HOME
+from paddlespeech.resource import CommonTaskResource
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.modules.ctc import CTCDecoder
from paddlespeech.s2t.utils.utility import UpdateConfig
@@ -36,7 +36,8 @@ __all__ = ['ASREngine', 'PaddleASRConnectionHandler']
class ASRServerExecutor(ASRExecutor):
def __init__(self):
super().__init__()
- self.pretrained_models = pretrained_models
+ self.task_resource = CommonTaskResource(
+ task='asr', model_format='static')
def _init_from_path(self,
model_type: str='wenetspeech',
@@ -53,17 +54,17 @@ class ASRServerExecutor(ASRExecutor):
self.max_len = 50
sample_rate_str = '16k' if sample_rate == 16000 else '8k'
tag = model_type + '-' + lang + '-' + sample_rate_str
+ self.task_resource.set_task_model(model_tag=tag)
if cfg_path is None or am_model is None or am_params is None:
- res_path = self._get_pretrained_path(tag) # wenetspeech_zh
- self.res_path = res_path
+ self.res_path = self.task_resource.res_dir
self.cfg_path = os.path.join(
- res_path, self.pretrained_models[tag]['cfg_path'])
+ self.res_path, self.task_resource.res_dict['cfg_path'])
- self.am_model = os.path.join(res_path,
- self.pretrained_models[tag]['model'])
- self.am_params = os.path.join(res_path,
- self.pretrained_models[tag]['params'])
- logger.info(res_path)
+ self.am_model = os.path.join(self.res_path,
+ self.task_resource.res_dict['model'])
+ self.am_params = os.path.join(self.res_path,
+ self.task_resource.res_dict['params'])
+ logger.info(self.res_path)
logger.info(self.cfg_path)
logger.info(self.am_model)
logger.info(self.am_params)
@@ -89,8 +90,8 @@ class ASRServerExecutor(ASRExecutor):
self.text_feature = TextFeaturizer(
unit_type=self.config.unit_type, vocab=self.vocab)
- lm_url = self.pretrained_models[tag]['lm_url']
- lm_md5 = self.pretrained_models[tag]['lm_md5']
+ lm_url = self.task_resource.res_dict['lm_url']
+ lm_md5 = self.task_resource.res_dict['lm_md5']
self.download_lm(
lm_url,
os.path.dirname(self.config.decode.lang_model_path), lm_md5)
diff --git a/paddlespeech/server/engine/asr/paddleinference/pretrained_models.py b/paddlespeech/server/engine/asr/paddleinference/pretrained_models.py
deleted file mode 100644
index c4c23e38cfb0b126e91090053054bcc50dc733e1..0000000000000000000000000000000000000000
--- a/paddlespeech/server/engine/asr/paddleinference/pretrained_models.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-pretrained_models = {
- "deepspeech2offline_aishell-zh-16k": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
- 'md5':
- '932c3593d62fe5c741b59b31318aa314',
- 'cfg_path':
- 'model.yaml',
- 'ckpt_path':
- 'exp/deepspeech2/checkpoints/avg_1',
- 'model':
- 'exp/deepspeech2/checkpoints/avg_1.jit.pdmodel',
- 'params':
- 'exp/deepspeech2/checkpoints/avg_1.jit.pdiparams',
- 'lm_url':
- 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
- 'lm_md5':
- '29e02312deb2e59b3c8686c7966d4fe3'
- },
-}
diff --git a/paddlespeech/server/engine/cls/paddleinference/cls_engine.py b/paddlespeech/server/engine/cls/paddleinference/cls_engine.py
index 44750c4747ed2ac3e01c1423d0ca65941d3b833e..389d56055ba617d1628b87e52aaf7301e9928c29 100644
--- a/paddlespeech/server/engine/cls/paddleinference/cls_engine.py
+++ b/paddlespeech/server/engine/cls/paddleinference/cls_engine.py
@@ -21,9 +21,9 @@ import numpy as np
import paddle
import yaml
-from .pretrained_models import pretrained_models
from paddlespeech.cli.cls.infer import CLSExecutor
from paddlespeech.cli.log import logger
+from paddlespeech.resource import CommonTaskResource
from paddlespeech.server.engine.base_engine import BaseEngine
from paddlespeech.server.utils.paddle_predictor import init_predictor
from paddlespeech.server.utils.paddle_predictor import run_model
@@ -34,11 +34,12 @@ __all__ = ['CLSEngine', 'PaddleCLSConnectionHandler']
class CLSServerExecutor(CLSExecutor):
def __init__(self):
super().__init__()
- self.pretrained_models = pretrained_models
+ self.task_resource = CommonTaskResource(
+ task='cls', model_format='static')
def _init_from_path(
self,
- model_type: str='panns_cnn14',
+ model_type: str='panns_cnn14_audioset',
cfg_path: Optional[os.PathLike]=None,
model_path: Optional[os.PathLike]=None,
params_path: Optional[os.PathLike]=None,
@@ -50,15 +51,16 @@ class CLSServerExecutor(CLSExecutor):
if cfg_path is None or model_path is None or params_path is None or label_file is None:
tag = model_type + '-' + '32k'
- self.res_path = self._get_pretrained_path(tag)
+ self.task_resource.set_task_model(model_tag=tag)
+ self.res_path = self.task_resource.res_dir
self.cfg_path = os.path.join(
- self.res_path, self.pretrained_models[tag]['cfg_path'])
+ self.res_path, self.task_resource.res_dict['cfg_path'])
self.model_path = os.path.join(
- self.res_path, self.pretrained_models[tag]['model_path'])
+ self.res_path, self.task_resource.res_dict['model_path'])
self.params_path = os.path.join(
- self.res_path, self.pretrained_models[tag]['params_path'])
+ self.res_path, self.task_resource.res_dict['params_path'])
self.label_file = os.path.join(
- self.res_path, self.pretrained_models[tag]['label_file'])
+ self.res_path, self.task_resource.res_dict['label_file'])
else:
self.cfg_path = os.path.abspath(cfg_path)
self.model_path = os.path.abspath(model_path)
diff --git a/paddlespeech/server/engine/cls/paddleinference/pretrained_models.py b/paddlespeech/server/engine/cls/paddleinference/pretrained_models.py
deleted file mode 100644
index e4914874600c2198e434d267c775dea66f3f252a..0000000000000000000000000000000000000000
--- a/paddlespeech/server/engine/cls/paddleinference/pretrained_models.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-pretrained_models = {
- "panns_cnn6-32k": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn6_static.tar.gz',
- 'md5':
- 'da087c31046d23281d8ec5188c1967da',
- 'cfg_path':
- 'panns.yaml',
- 'model_path':
- 'inference.pdmodel',
- 'params_path':
- 'inference.pdiparams',
- 'label_file':
- 'audioset_labels.txt',
- },
- "panns_cnn10-32k": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn10_static.tar.gz',
- 'md5':
- '5460cc6eafbfaf0f261cc75b90284ae1',
- 'cfg_path':
- 'panns.yaml',
- 'model_path':
- 'inference.pdmodel',
- 'params_path':
- 'inference.pdiparams',
- 'label_file':
- 'audioset_labels.txt',
- },
- "panns_cnn14-32k": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn14_static.tar.gz',
- 'md5':
- 'ccc80b194821274da79466862b2ab00f',
- 'cfg_path':
- 'panns.yaml',
- 'model_path':
- 'inference.pdmodel',
- 'params_path':
- 'inference.pdiparams',
- 'label_file':
- 'audioset_labels.txt',
- },
-}
diff --git a/paddlespeech/server/engine/tts/online/onnx/pretrained_models.py b/paddlespeech/server/engine/tts/online/onnx/pretrained_models.py
deleted file mode 100644
index 789f5be7d7ca16965459fec6df7e40f7713ee104..0000000000000000000000000000000000000000
--- a/paddlespeech/server/engine/tts/online/onnx/pretrained_models.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# support online model
-pretrained_models = {
- # fastspeech2
- "fastspeech2_csmsc_onnx-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip',
- 'md5':
- 'fd3ad38d83273ad51f0ea4f4abf3ab4e',
- 'ckpt': ['fastspeech2_csmsc.onnx'],
- 'phones_dict':
- 'phone_id_map.txt',
- 'sample_rate':
- 24000,
- },
- "fastspeech2_cnndecoder_csmsc_onnx-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip',
- 'md5':
- '5f70e1a6bcd29d72d54e7931aa86f266',
- 'ckpt': [
- 'fastspeech2_csmsc_am_encoder_infer.onnx',
- 'fastspeech2_csmsc_am_decoder.onnx',
- 'fastspeech2_csmsc_am_postnet.onnx',
- ],
- 'speech_stats':
- 'speech_stats.npy',
- 'phones_dict':
- 'phone_id_map.txt',
- 'sample_rate':
- 24000,
- },
-
- # mb_melgan
- "mb_melgan_csmsc_onnx-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip',
- 'md5':
- '5b83ec746e8414bc29032d954ffd07ec',
- 'ckpt':
- 'mb_melgan_csmsc.onnx',
- 'sample_rate':
- 24000,
- },
-
- # hifigan
- "hifigan_csmsc_onnx-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip',
- 'md5':
- '1a7dc0385875889e46952e50c0994a6b',
- 'ckpt':
- 'hifigan_csmsc.onnx',
- 'sample_rate':
- 24000,
- },
-}
diff --git a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py
index fd438da0314881875db0dfabf13ec9e04a8770cf..cb9155a2d5795f700521b0be8e0ad6b887904099 100644
--- a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py
+++ b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py
@@ -20,9 +20,9 @@ from typing import Optional
import numpy as np
import paddle
-from .pretrained_models import pretrained_models
from paddlespeech.cli.log import logger
from paddlespeech.cli.tts.infer import TTSExecutor
+from paddlespeech.resource import CommonTaskResource
from paddlespeech.server.engine.base_engine import BaseEngine
from paddlespeech.server.utils.audio_process import float2pcm
from paddlespeech.server.utils.onnx_infer import get_sess
@@ -37,7 +37,7 @@ __all__ = ['TTSEngine', 'PaddleTTSConnectionHandler']
class TTSServerExecutor(TTSExecutor):
def __init__(self):
super().__init__()
- self.pretrained_models = pretrained_models
+ self.task_resource = CommonTaskResource(task='tts', model_format='onnx')
def _init_from_path(
self,
@@ -66,16 +66,21 @@ class TTSServerExecutor(TTSExecutor):
return
# am
am_tag = am + '-' + lang
+ self.task_resource.set_task_model(
+ model_tag=am_tag,
+ model_type=0, # am
+ version=None, # default version
+ )
+ self.am_res_path = self.task_resource.res_dir
if am == "fastspeech2_csmsc_onnx":
# get model info
if am_ckpt is None or phones_dict is None:
- am_res_path = self._get_pretrained_path(am_tag)
- self.am_res_path = am_res_path
self.am_ckpt = os.path.join(
- am_res_path, self.pretrained_models[am_tag]['ckpt'][0])
+ self.am_res_path, self.task_resource.res_dict['ckpt'][0])
# must have phones_dict in acoustic
self.phones_dict = os.path.join(
- am_res_path, self.pretrained_models[am_tag]['phones_dict'])
+ self.am_res_path,
+ self.task_resource.res_dict['phones_dict'])
else:
self.am_ckpt = os.path.abspath(am_ckpt[0])
@@ -88,19 +93,19 @@ class TTSServerExecutor(TTSExecutor):
elif am == "fastspeech2_cnndecoder_csmsc_onnx":
if am_ckpt is None or am_stat is None or phones_dict is None:
- am_res_path = self._get_pretrained_path(am_tag)
- self.am_res_path = am_res_path
self.am_encoder_infer = os.path.join(
- am_res_path, self.pretrained_models[am_tag]['ckpt'][0])
+ self.am_res_path, self.task_resource.res_dict['ckpt'][0])
self.am_decoder = os.path.join(
- am_res_path, self.pretrained_models[am_tag]['ckpt'][1])
+ self.am_res_path, self.task_resource.res_dict['ckpt'][1])
self.am_postnet = os.path.join(
- am_res_path, self.pretrained_models[am_tag]['ckpt'][2])
+ self.am_res_path, self.task_resource.res_dict['ckpt'][2])
# must have phones_dict in acoustic
self.phones_dict = os.path.join(
- am_res_path, self.pretrained_models[am_tag]['phones_dict'])
+ self.am_res_path,
+ self.task_resource.res_dict['phones_dict'])
self.am_stat = os.path.join(
- am_res_path, self.pretrained_models[am_tag]['speech_stats'])
+ self.am_res_path,
+ self.task_resource.res_dict['speech_stats'])
else:
self.am_encoder_infer = os.path.abspath(am_ckpt[0])
@@ -125,11 +130,15 @@ class TTSServerExecutor(TTSExecutor):
# voc model info
voc_tag = voc + '-' + lang
+ self.task_resource.set_task_model(
+ model_tag=voc_tag,
+ model_type=1, # vocoder
+ version=None, # default version
+ )
if voc_ckpt is None:
- voc_res_path = self._get_pretrained_path(voc_tag)
- self.voc_res_path = voc_res_path
+ self.voc_res_path = self.task_resource.voc_res_dir
self.voc_ckpt = os.path.join(
- voc_res_path, self.pretrained_models[voc_tag]['ckpt'])
+ self.voc_res_path, self.task_resource.voc_res_dict['ckpt'])
else:
self.voc_ckpt = os.path.abspath(voc_ckpt)
self.voc_res_path = os.path.dirname(os.path.abspath(self.voc_ckpt))
diff --git a/paddlespeech/server/engine/tts/online/python/pretrained_models.py b/paddlespeech/server/engine/tts/online/python/pretrained_models.py
deleted file mode 100644
index bf6aded51168c2c21172ec8101413b4cb0e05154..0000000000000000000000000000000000000000
--- a/paddlespeech/server/engine/tts/online/python/pretrained_models.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# support online model
-pretrained_models = {
- # fastspeech2
- "fastspeech2_csmsc-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip',
- 'md5':
- '637d28a5e53aa60275612ba4393d5f22',
- 'config':
- 'default.yaml',
- 'ckpt':
- 'snapshot_iter_76000.pdz',
- 'speech_stats':
- 'speech_stats.npy',
- 'phones_dict':
- 'phone_id_map.txt',
- },
- "fastspeech2_cnndecoder_csmsc-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip',
- 'md5':
- '6eb28e22ace73e0ebe7845f86478f89f',
- 'config':
- 'cnndecoder.yaml',
- 'ckpt':
- 'snapshot_iter_153000.pdz',
- 'speech_stats':
- 'speech_stats.npy',
- 'phones_dict':
- 'phone_id_map.txt',
- },
-
- # mb_melgan
- "mb_melgan_csmsc-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip',
- 'md5':
- 'ee5f0604e20091f0d495b6ec4618b90d',
- 'config':
- 'default.yaml',
- 'ckpt':
- 'snapshot_iter_1000000.pdz',
- 'speech_stats':
- 'feats_stats.npy',
- },
-
- # hifigan
- "hifigan_csmsc-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip',
- 'md5':
- 'dd40a3d88dfcf64513fba2f0f961ada6',
- 'config':
- 'default.yaml',
- 'ckpt':
- 'snapshot_iter_2500000.pdz',
- 'speech_stats':
- 'feats_stats.npy',
- },
-}
diff --git a/paddlespeech/server/engine/tts/online/python/tts_engine.py b/paddlespeech/server/engine/tts/online/python/tts_engine.py
index eaa179929f40625bafc35a58de5d30a8808830e6..5783b9fc0bd0516253af9216e8adc95d3a5aec99 100644
--- a/paddlespeech/server/engine/tts/online/python/tts_engine.py
+++ b/paddlespeech/server/engine/tts/online/python/tts_engine.py
@@ -22,9 +22,9 @@ import paddle
import yaml
from yacs.config import CfgNode
-from .pretrained_models import pretrained_models
from paddlespeech.cli.log import logger
from paddlespeech.cli.tts.infer import TTSExecutor
+from paddlespeech.resource import CommonTaskResource
from paddlespeech.server.engine.base_engine import BaseEngine
from paddlespeech.server.utils.audio_process import float2pcm
from paddlespeech.server.utils.util import denorm
@@ -32,7 +32,6 @@ from paddlespeech.server.utils.util import get_chunks
from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.modules.normalizer import ZScore
-from paddlespeech.utils.dynamic_import import dynamic_import
__all__ = ['TTSEngine', 'PaddleTTSConnectionHandler']
@@ -40,7 +39,9 @@ __all__ = ['TTSEngine', 'PaddleTTSConnectionHandler']
class TTSServerExecutor(TTSExecutor):
def __init__(self):
super().__init__()
- self.pretrained_models = pretrained_models
+ self.task_resource = CommonTaskResource(
+ task='tts', model_format='static', inference_mode='online')
+
def get_model_info(self,
field: str,
@@ -61,7 +62,7 @@ class TTSServerExecutor(TTSExecutor):
[Tensor]: standard deviation
"""
- model_class = dynamic_import(model_name, self.model_alias)
+ model_class = self.task_resource.get_model_class(model_name)
if field == "am":
odim = self.am_config.n_mels
@@ -106,20 +107,24 @@ class TTSServerExecutor(TTSExecutor):
return
# am model info
am_tag = am + '-' + lang
+ self.task_resource.set_task_model(
+ model_tag=am_tag,
+ model_type=0, # am
+ version=None, # default version
+ )
if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None:
- am_res_path = self._get_pretrained_path(am_tag)
- self.am_res_path = am_res_path
- self.am_config = os.path.join(
- am_res_path, self.pretrained_models[am_tag]['config'])
- self.am_ckpt = os.path.join(am_res_path,
- self.pretrained_models[am_tag]['ckpt'])
+ self.am_res_path = self.task_resource.res_dir
+ self.am_config = os.path.join(self.am_res_path,
+ self.task_resource.res_dict['config'])
+ self.am_ckpt = os.path.join(self.am_res_path,
+ self.task_resource.res_dict['ckpt'])
self.am_stat = os.path.join(
- am_res_path, self.pretrained_models[am_tag]['speech_stats'])
+ self.am_res_path, self.task_resource.res_dict['speech_stats'])
# must have phones_dict in acoustic
self.phones_dict = os.path.join(
- am_res_path, self.pretrained_models[am_tag]['phones_dict'])
+ self.am_res_path, self.task_resource.res_dict['phones_dict'])
print("self.phones_dict:", self.phones_dict)
- logger.info(am_res_path)
+ logger.info(self.am_res_path)
logger.info(self.am_config)
logger.info(self.am_ckpt)
else:
@@ -135,16 +140,21 @@ class TTSServerExecutor(TTSExecutor):
# voc model info
voc_tag = voc + '-' + lang
+ self.task_resource.set_task_model(
+ model_tag=voc_tag,
+ model_type=1, # vocoder
+ version=None, # default version
+ )
if voc_ckpt is None or voc_config is None or voc_stat is None:
- voc_res_path = self._get_pretrained_path(voc_tag)
- self.voc_res_path = voc_res_path
+ self.voc_res_path = self.task_resource.voc_res_dir
self.voc_config = os.path.join(
- voc_res_path, self.pretrained_models[voc_tag]['config'])
+ self.voc_res_path, self.task_resource.voc_res_dict['config'])
self.voc_ckpt = os.path.join(
- voc_res_path, self.pretrained_models[voc_tag]['ckpt'])
+ self.voc_res_path, self.task_resource.voc_res_dict['ckpt'])
self.voc_stat = os.path.join(
- voc_res_path, self.pretrained_models[voc_tag]['speech_stats'])
- logger.info(voc_res_path)
+ self.voc_res_path,
+ self.task_resource.voc_res_dict['speech_stats'])
+ logger.info(self.voc_res_path)
logger.info(self.voc_config)
logger.info(self.voc_ckpt)
else:
@@ -184,8 +194,8 @@ class TTSServerExecutor(TTSExecutor):
am, am_mu, am_std = self.get_model_info("am", self.am_name,
self.am_ckpt, self.am_stat)
am_normalizer = ZScore(am_mu, am_std)
- am_inference_class = dynamic_import(self.am_name + '_inference',
- self.model_alias)
+ am_inference_class = self.task_resource.get_model_class(
+ self.am_name + '_inference')
self.am_inference = am_inference_class(am_normalizer, am)
self.am_inference.eval()
print("acoustic model done!")
@@ -195,8 +205,8 @@ class TTSServerExecutor(TTSExecutor):
voc, voc_mu, voc_std = self.get_model_info("voc", self.voc_name,
self.voc_ckpt, self.voc_stat)
voc_normalizer = ZScore(voc_mu, voc_std)
- voc_inference_class = dynamic_import(self.voc_name + '_inference',
- self.model_alias)
+ voc_inference_class = self.task_resource.get_model_class(self.voc_name +
+ '_inference')
self.voc_inference = voc_inference_class(voc_normalizer, voc)
self.voc_inference.eval()
print("voc done!")
diff --git a/paddlespeech/server/engine/tts/paddleinference/pretrained_models.py b/paddlespeech/server/engine/tts/paddleinference/pretrained_models.py
deleted file mode 100644
index 9618a7a697765f532a172c551b6be733a68a1bec..0000000000000000000000000000000000000000
--- a/paddlespeech/server/engine/tts/paddleinference/pretrained_models.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Static model applied on paddle inference
-pretrained_models = {
- # speedyspeech
- "speedyspeech_csmsc-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip',
- 'md5':
- 'f10cbdedf47dc7a9668d2264494e1823',
- 'model':
- 'speedyspeech_csmsc.pdmodel',
- 'params':
- 'speedyspeech_csmsc.pdiparams',
- 'phones_dict':
- 'phone_id_map.txt',
- 'tones_dict':
- 'tone_id_map.txt',
- 'sample_rate':
- 24000,
- },
- # fastspeech2
- "fastspeech2_csmsc-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip',
- 'md5':
- '9788cd9745e14c7a5d12d32670b2a5a7',
- 'model':
- 'fastspeech2_csmsc.pdmodel',
- 'params':
- 'fastspeech2_csmsc.pdiparams',
- 'phones_dict':
- 'phone_id_map.txt',
- 'sample_rate':
- 24000,
- },
- # pwgan
- "pwgan_csmsc-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip',
- 'md5':
- 'e3504aed9c5a290be12d1347836d2742',
- 'model':
- 'pwgan_csmsc.pdmodel',
- 'params':
- 'pwgan_csmsc.pdiparams',
- 'sample_rate':
- 24000,
- },
- # mb_melgan
- "mb_melgan_csmsc-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip',
- 'md5':
- 'ac6eee94ba483421d750433f4c3b8d36',
- 'model':
- 'mb_melgan_csmsc.pdmodel',
- 'params':
- 'mb_melgan_csmsc.pdiparams',
- 'sample_rate':
- 24000,
- },
- # hifigan
- "hifigan_csmsc-zh": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip',
- 'md5':
- '7edd8c436b3a5546b3a7cb8cff9d5a0c',
- 'model':
- 'hifigan_csmsc.pdmodel',
- 'params':
- 'hifigan_csmsc.pdiparams',
- 'sample_rate':
- 24000,
- },
-}
diff --git a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py
index 1676801e7cd1f9d7a5843a5bf7ac8b339eaf8f54..ab5b721ff0041c803c1b07fc4256a85040330909 100644
--- a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py
+++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py
@@ -24,9 +24,9 @@ import paddle
import soundfile as sf
from scipy.io import wavfile
-from .pretrained_models import pretrained_models
from paddlespeech.cli.log import logger
from paddlespeech.cli.tts.infer import TTSExecutor
+from paddlespeech.resource import CommonTaskResource
from paddlespeech.server.engine.base_engine import BaseEngine
from paddlespeech.server.utils.audio_process import change_speed
from paddlespeech.server.utils.errors import ErrorCode
@@ -42,7 +42,8 @@ __all__ = ['TTSEngine', 'PaddleTTSConnectionHandler']
class TTSServerExecutor(TTSExecutor):
def __init__(self):
super().__init__()
- self.pretrained_models = pretrained_models
+ self.task_resource = CommonTaskResource(
+ task='tts', model_format='static')
def _init_from_path(
self,
@@ -68,19 +69,23 @@ class TTSServerExecutor(TTSExecutor):
return
# am
am_tag = am + '-' + lang
+ self.task_resource.set_task_model(
+ model_tag=am_tag,
+ model_type=0, # am
+ version=None, # default version
+ )
if am_model is None or am_params is None or phones_dict is None:
- am_res_path = self._get_pretrained_path(am_tag)
- self.am_res_path = am_res_path
- self.am_model = os.path.join(
- am_res_path, self.pretrained_models[am_tag]['model'])
- self.am_params = os.path.join(
- am_res_path, self.pretrained_models[am_tag]['params'])
+ self.am_res_path = self.task_resource.res_dir
+ self.am_model = os.path.join(self.am_res_path,
+ self.task_resource.res_dict['model'])
+ self.am_params = os.path.join(self.am_res_path,
+ self.task_resource.res_dict['params'])
# must have phones_dict in acoustic
self.phones_dict = os.path.join(
- am_res_path, self.pretrained_models[am_tag]['phones_dict'])
- self.am_sample_rate = self.pretrained_models[am_tag]['sample_rate']
+ self.am_res_path, self.task_resource.res_dict['phones_dict'])
+ self.am_sample_rate = self.task_resource.res_dict['sample_rate']
- logger.info(am_res_path)
+ logger.info(self.am_res_path)
logger.info(self.am_model)
logger.info(self.am_params)
else:
@@ -93,32 +98,36 @@ class TTSServerExecutor(TTSExecutor):
# for speedyspeech
self.tones_dict = None
- if 'tones_dict' in self.pretrained_models[am_tag]:
+ if 'tones_dict' in self.task_resource.res_dict:
self.tones_dict = os.path.join(
- am_res_path, self.pretrained_models[am_tag]['tones_dict'])
+ self.am_res_path, self.task_resource.res_dict['tones_dict'])
if tones_dict:
self.tones_dict = tones_dict
# for multi speaker fastspeech2
self.speaker_dict = None
- if 'speaker_dict' in self.pretrained_models[am_tag]:
+ if 'speaker_dict' in self.task_resource.res_dict:
self.speaker_dict = os.path.join(
- am_res_path, self.pretrained_models[am_tag]['speaker_dict'])
+ self.am_res_path, self.task_resource.res_dict['speaker_dict'])
if speaker_dict:
self.speaker_dict = speaker_dict
# voc
voc_tag = voc + '-' + lang
+ self.task_resource.set_task_model(
+ model_tag=voc_tag,
+ model_type=1, # vocoder
+ version=None, # default version
+ )
if voc_model is None or voc_params is None:
- voc_res_path = self._get_pretrained_path(voc_tag)
- self.voc_res_path = voc_res_path
+ self.voc_res_path = self.task_resource.voc_res_dir
self.voc_model = os.path.join(
- voc_res_path, self.pretrained_models[voc_tag]['model'])
+ self.voc_res_path, self.task_resource.voc_res_dict['model'])
self.voc_params = os.path.join(
- voc_res_path, self.pretrained_models[voc_tag]['params'])
- self.voc_sample_rate = self.pretrained_models[voc_tag][
+ self.voc_res_path, self.task_resource.voc_res_dict['params'])
+ self.voc_sample_rate = self.task_resource.voc_res_dict[
'sample_rate']
- logger.info(voc_res_path)
+ logger.info(self.voc_res_path)
logger.info(self.voc_model)
logger.info(self.voc_params)
else:
diff --git a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py
index c70821e78fe6e4063d74e8c5608ede225ed1b230..4c733dc9b05ba8880d415482d7c194c910039f92 100644
--- a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py
@@ -243,8 +243,7 @@ def main():
# parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(description="Train a HiFiGAN model.")
- parser.add_argument(
- "--config", type=str, help="config file to overwrite default config.")
+ parser.add_argument("--config", type=str, help="HiFiGAN config file.")
parser.add_argument("--train-metadata", type=str, help="training data.")
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py
index 27ffded63b3621c0f2110815b27fd4420ef0bc5a..3b3ebb4788e2cd4b58119996f74c8dc0d1bfc46b 100644
--- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py
@@ -233,7 +233,7 @@ def main():
parser = argparse.ArgumentParser(
description="Train a Multi-Band MelGAN model.")
parser.add_argument(
- "--config", type=str, help="config file to overwrite default config.")
+ "--config", type=str, help="Multi-Band MelGAN config file.")
parser.add_argument("--train-metadata", type=str, help="training data.")
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py
index 92de7a2c4e7a04ed28b7b30dfa47be4796acc93f..b26407028928cec639d47f576e2bb1f6766e1990 100644
--- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py
@@ -208,7 +208,7 @@ def main():
parser = argparse.ArgumentParser(
description="Train a ParallelWaveGAN model.")
parser.add_argument(
- "--config", type=str, help="config file to overwrite default config.")
+ "--config", type=str, help="ParallelWaveGAN config file.")
parser.add_argument("--train-metadata", type=str, help="training data.")
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
diff --git a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py
index be3ba74251d92cf90be713651837205fa8dc582a..a87cc7a182fdc88d9770021969b4d4248d87e83a 100644
--- a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py
@@ -224,8 +224,7 @@ def main():
# parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(description="Train a Style MelGAN model.")
- parser.add_argument(
- "--config", type=str, help="config file to overwrite default config.")
+ parser.add_argument("--config", type=str, help="Style MelGAN config file.")
parser.add_argument("--train-metadata", type=str, help="training data.")
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py
index 45ecb269bac033fed4287e5083ced6ce92b89f35..da48b6b99700ed49e5c815bf6b6f14c8eecfae95 100644
--- a/paddlespeech/t2s/exps/transformer_tts/train.py
+++ b/paddlespeech/t2s/exps/transformer_tts/train.py
@@ -160,7 +160,7 @@ def main():
parser = argparse.ArgumentParser(description="Train a TransformerTTS "
"model with LJSpeech TTS dataset.")
parser.add_argument(
- "--config", type=str, help="config file to overwrite default config.")
+ "--config", type=str, help="TransformerTTS config file.")
parser.add_argument("--train-metadata", type=str, help="training data.")
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py
index b921f92af75ba367d702be94050e563538e7c755..dbda8b7177bca068ecaeabe41679a93e153aba35 100644
--- a/paddlespeech/t2s/exps/vits/train.py
+++ b/paddlespeech/t2s/exps/vits/train.py
@@ -226,9 +226,8 @@ def train_sp(args, config):
def main():
# parse args and config and redirect to train_sp
- parser = argparse.ArgumentParser(description="Train a HiFiGAN model.")
- parser.add_argument(
- "--config", type=str, help="config file to overwrite default config.")
+ parser = argparse.ArgumentParser(description="Train a VITS model.")
+ parser.add_argument("--config", type=str, help="VITS config file")
parser.add_argument("--train-metadata", type=str, help="training data.")
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
diff --git a/paddlespeech/t2s/exps/wavernn/train.py b/paddlespeech/t2s/exps/wavernn/train.py
index 8661d311d218bda58142a846f3dedce5a07ffabf..cf24ea26888002afcfba19449ec8ac5db6efe517 100644
--- a/paddlespeech/t2s/exps/wavernn/train.py
+++ b/paddlespeech/t2s/exps/wavernn/train.py
@@ -180,8 +180,7 @@ def main():
# parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(description="Train a WaveRNN model.")
- parser.add_argument(
- "--config", type=str, help="config file to overwrite default config.")
+ parser.add_argument("--config", type=str, help="WaveRNN config file.")
parser.add_argument("--train-metadata", type=str, help="training data.")
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
diff --git a/speechx/README.md b/speechx/README.md
index f75d8ac4eb42c4ca257279763006bd3fc61dee42..cd1cd62c154c28e007d5b20c8db77fa712f1e071 100644
--- a/speechx/README.md
+++ b/speechx/README.md
@@ -44,13 +44,13 @@ More details please see `README.md` under `examples`.
> If using docker please check `--privileged` is set when `docker run`.
* Fatal error at startup: `a function redirection which is mandatory for this platform-tool combination cannot be set up`
-```
+```bash
apt-get install libc6-dbg
```
* Install
-```
+```bash
pushd tools
./setup_valgrind.sh
popd
@@ -59,4 +59,4 @@ popd
## TODO
### Deepspeech2 with linear feature
-* DecibelNormalizer: there is a little bit difference between offline and online db norm. The computation of online db norm read feature chunk by chunk, which causes the feature size is different with offline db norm. In normalizer.cc:73, the samples.size() is different, which causes the difference of result.
+* DecibelNormalizer: there is a small difference between the offline and online db norm. The computation of the online db norm reads features chunk by chunk, which causes the feature size to differ from the offline db norm. In `normalizer.cc:73`, `samples.size()` differs, which causes the difference in results.
diff --git a/speechx/examples/CMakeLists.txt b/speechx/examples/CMakeLists.txt
deleted file mode 100644
index 3c274a20a806bb84cf04c1374bcec126e657321a..0000000000000000000000000000000000000000
--- a/speechx/examples/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-
-add_subdirectory(ds2_ol)
-add_subdirectory(dev)
\ No newline at end of file
diff --git a/speechx/examples/README.md b/speechx/examples/README.md
index b18c88e048dc242532e43f032c8098b872ef831a..1b977523c30886461b93256ff95f55ebbfe2602d 100644
--- a/speechx/examples/README.md
+++ b/speechx/examples/README.md
@@ -22,14 +22,7 @@ netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel --port 8022 --host
## For Developer
-> Warning: Only for developer, make sure you know what's it.
+> Reminder: for developers only; make sure you know what it is before using it.
-* dev - for speechx developer, using for test.
+* codelab - for speechx developers, used for testing.
-## Build WFST
-
-> Warning: Using below example when you know what's it.
-
-* text_lm - process text for build lm
-* ngram - using to build NGram ARPA lm.
-* wfst - build wfst for TLG.
diff --git a/speechx/examples/codelab/README.md b/speechx/examples/codelab/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f89184de9b1795f82daa23f79207c4047b55bc46
--- /dev/null
+++ b/speechx/examples/codelab/README.md
@@ -0,0 +1,8 @@
+# Codelab
+
+## Introduction
+
+> The following is for development and offline testing. Do not run it unless you know what it is.
+* nnet
+* feat
+* decoder
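+
+Each directory contains a `run.sh` that exercises the corresponding speechx binaries.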
diff --git a/speechx/examples/ds2_ol/decoder/.gitignore b/speechx/examples/codelab/decoder/.gitignore
similarity index 100%
rename from speechx/examples/ds2_ol/decoder/.gitignore
rename to speechx/examples/codelab/decoder/.gitignore
diff --git a/speechx/examples/ds2_ol/decoder/README.md b/speechx/examples/codelab/decoder/README.md
similarity index 100%
rename from speechx/examples/ds2_ol/decoder/README.md
rename to speechx/examples/codelab/decoder/README.md
diff --git a/speechx/examples/codelab/decoder/path.sh b/speechx/examples/codelab/decoder/path.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9d22917439cab4963e16903d30fce6e991f13e76
--- /dev/null
+++ b/speechx/examples/codelab/decoder/path.sh
@@ -0,0 +1,14 @@
+# This contains the locations of the binaries built for running the examples.
+
+SPEECHX_ROOT=$PWD/../../../
+SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx
+
+SPEECHX_TOOLS=$SPEECHX_ROOT/tools
+TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
+
+[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. Please ensure that the project was built successfully"; }
+
+export LC_ALL=C
+
+SPEECHX_BIN=$SPEECHX_ROOT/build/speechx/decoder:$SPEECHX_ROOT/build/speechx/frontend/audio
+export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
diff --git a/speechx/examples/ds2_ol/decoder/run.sh b/speechx/examples/codelab/decoder/run.sh
similarity index 94%
rename from speechx/examples/ds2_ol/decoder/run.sh
rename to speechx/examples/codelab/decoder/run.sh
index 40501eb41ba2ea5efc3477c12aa8f9abb3e9520c..a911eb033fdaee6f641fe83b7f3e6306e8809478 100755
--- a/speechx/examples/ds2_ol/decoder/run.sh
+++ b/speechx/examples/codelab/decoder/run.sh
@@ -54,7 +54,7 @@ cmvn=$exp_dir/cmvn.ark
export GLOG_logtostderr=1
# dump json cmvn to kaldi
-cmvn-json2kaldi \
+cmvn_json2kaldi_main \
--json_file $ckpt_dir/data/mean_std.json \
--cmvn_write_path $cmvn \
--binary=false
@@ -62,17 +62,17 @@ echo "convert json cmvn to kaldi ark."
# generate linear feature as streaming
-linear-spectrogram-wo-db-norm-ol \
+compute_linear_spectrogram_main \
--wav_rspecifier=scp:$data/wav.scp \
--feature_wspecifier=ark,t:$feat_wspecifier \
--cmvn_file=$cmvn
echo "compute linear spectrogram feature."
# run ctc beam search decoder as streaming
-ctc-prefix-beam-search-decoder-ol \
+ctc_prefix_beam_search_decoder_main \
--result_wspecifier=ark,t:$exp_dir/result.txt \
--feature_rspecifier=ark:$feat_wspecifier \
--model_path=$model_dir/avg_1.jit.pdmodel \
--param_path=$model_dir/avg_1.jit.pdiparams \
--dict_file=$vocb_dir/vocab.txt \
- --lm_path=$lm
\ No newline at end of file
+ --lm_path=$lm
diff --git a/speechx/examples/ds2_ol/decoder/valgrind.sh b/speechx/examples/codelab/decoder/valgrind.sh
similarity index 100%
rename from speechx/examples/ds2_ol/decoder/valgrind.sh
rename to speechx/examples/codelab/decoder/valgrind.sh
diff --git a/speechx/examples/ds2_ol/feat/README.md b/speechx/examples/codelab/feat/README.md
similarity index 58%
rename from speechx/examples/ds2_ol/feat/README.md
rename to speechx/examples/codelab/feat/README.md
index 89cb79eca1739cb560743e78708e165061376f57..e59e02bf9a65642b4807b63acf439fa376445bd8 100644
--- a/speechx/examples/ds2_ol/feat/README.md
+++ b/speechx/examples/codelab/feat/README.md
@@ -2,6 +2,6 @@
ASR audio feature test bins. We using theses bins to test linaer/fbank/mfcc asr feature as streaming manner.
-* linear_spectrogram_without_db_norm_main.cc
+* compute_linear_spectrogram_main.cc
-compute linear spectrogram w/o db norm in streaming manner.
+compute linear spectrogram without db norm in a streaming manner.
diff --git a/speechx/examples/dev/glog/path.sh b/speechx/examples/codelab/feat/path.sh
similarity index 82%
rename from speechx/examples/dev/glog/path.sh
rename to speechx/examples/codelab/feat/path.sh
index 1a96a861aa82df2eff926a844990d988935696be..3b89d01e9ec0bfea521388957ef0d73521b5db30 100644
--- a/speechx/examples/dev/glog/path.sh
+++ b/speechx/examples/codelab/feat/path.sh
@@ -1,15 +1,14 @@
# This contains the locations of binarys build required for running the examples.
SPEECHX_ROOT=$PWD/../../../
+SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
SPEECHX_TOOLS=$SPEECHX_ROOT/tools
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
-
-SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
-SPEECHX_BIN=$SPEECHX_EXAMPLES/dev/glog
-export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
-
export LC_AL=C
+
+SPEECHX_BIN=$SPEECHX_ROOT/build/speechx/decoder:$SPEECHX_ROOT/build/speechx/frontend/audio
+export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
diff --git a/speechx/examples/ds2_ol/feat/run.sh b/speechx/examples/codelab/feat/run.sh
similarity index 95%
rename from speechx/examples/ds2_ol/feat/run.sh
rename to speechx/examples/codelab/feat/run.sh
index 75777927547350bf7b9549dcb4a34e2d4ee94841..1fa37f981b4942c449d4a779b0de8d0de71b9c4a 100755
--- a/speechx/examples/ds2_ol/feat/run.sh
+++ b/speechx/examples/codelab/feat/run.sh
@@ -41,14 +41,14 @@ mkdir -p $exp_dir
# 3. run feat
export GLOG_logtostderr=1
-cmvn-json2kaldi \
+cmvn_json2kaldi_main \
--json_file $model_dir/data/mean_std.json \
--cmvn_write_path $exp_dir/cmvn.ark \
--binary=false
echo "convert json cmvn to kaldi ark."
-linear-spectrogram-wo-db-norm-ol \
+compute_linear_spectrogram_main \
--wav_rspecifier=scp:$data_dir/wav.scp \
--feature_wspecifier=ark,t:$exp_dir/feats.ark \
--cmvn_file=$exp_dir/cmvn.ark
diff --git a/speechx/examples/ds2_ol/feat/valgrind.sh b/speechx/examples/codelab/feat/valgrind.sh
similarity index 93%
rename from speechx/examples/ds2_ol/feat/valgrind.sh
rename to speechx/examples/codelab/feat/valgrind.sh
index f8aab63f8c99add2c7fd8c66db7f9a71ef0c6bf3..ea50fdc23327bb073bcc2fb9bee0839ce2784b6f 100755
--- a/speechx/examples/ds2_ol/feat/valgrind.sh
+++ b/speechx/examples/codelab/feat/valgrind.sh
@@ -17,7 +17,7 @@ feat_wspecifier=./feats.ark
cmvn=./cmvn.ark
valgrind --tool=memcheck --track-origins=yes --leak-check=full --show-leak-kinds=all \
- linear_spectrogram_main \
+ compute_linear_spectrogram_main \
--wav_rspecifier=scp:$model_dir/wav.scp \
--feature_wspecifier=ark,t:$feat_wspecifier \
--cmvn_write_path=$cmvn
diff --git a/speechx/examples/ds2_ol/nnet/.gitignore b/speechx/examples/codelab/nnet/.gitignore
similarity index 100%
rename from speechx/examples/ds2_ol/nnet/.gitignore
rename to speechx/examples/codelab/nnet/.gitignore
diff --git a/speechx/examples/ds2_ol/nnet/README.md b/speechx/examples/codelab/nnet/README.md
similarity index 100%
rename from speechx/examples/ds2_ol/nnet/README.md
rename to speechx/examples/codelab/nnet/README.md
diff --git a/speechx/examples/ds2_ol/feat/path.sh b/speechx/examples/codelab/nnet/path.sh
similarity index 81%
rename from speechx/examples/ds2_ol/feat/path.sh
rename to speechx/examples/codelab/nnet/path.sh
index ad2b6a4e9bb897ff7565bf4e9dc7bcc2440151af..7d395d648348651f4a20694b35c3105d9112d392 100644
--- a/speechx/examples/ds2_ol/feat/path.sh
+++ b/speechx/examples/codelab/nnet/path.sh
@@ -1,7 +1,7 @@
# This contains the locations of binarys build required for running the examples.
SPEECHX_ROOT=$PWD/../../../
-SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
+SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx
SPEECHX_TOOLS=$SPEECHX_ROOT/tools
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
@@ -10,5 +10,5 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
export LC_AL=C
-SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/feat
+SPEECHX_BIN=$SPEECHX_BUILD/codelab/nnet
export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
diff --git a/speechx/examples/ds2_ol/nnet/run.sh b/speechx/examples/codelab/nnet/run.sh
similarity index 75%
rename from speechx/examples/ds2_ol/nnet/run.sh
rename to speechx/examples/codelab/nnet/run.sh
index 10029f7e8c79e479d11d7b60760bb64b4a510084..842499ba2048a2fc37d1699f077be1de24073e0a 100755
--- a/speechx/examples/ds2_ol/nnet/run.sh
+++ b/speechx/examples/codelab/nnet/run.sh
@@ -20,19 +20,10 @@ if [ ! -f data/model/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz ];
popd
fi
-# produce wav scp
-if [ ! -f data/wav.scp ]; then
- mkdir -p data
- pushd data
- wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
- echo "utt1 " $PWD/zh.wav > wav.scp
- popd
-fi
-
ckpt_dir=./data/model
model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
-ds2-model-ol-test \
+ds2_model_test_main \
--model_path=$model_dir/avg_1.jit.pdmodel \
--param_path=$model_dir/avg_1.jit.pdiparams
diff --git a/speechx/examples/ds2_ol/nnet/valgrind.sh b/speechx/examples/codelab/nnet/valgrind.sh
similarity index 71%
rename from speechx/examples/ds2_ol/nnet/valgrind.sh
rename to speechx/examples/codelab/nnet/valgrind.sh
index 2a08c6082f7803538a26b97dc34d968642b16f81..a5aab6637d1e25aed5b6825e9277fad556644a48 100755
--- a/speechx/examples/ds2_ol/nnet/valgrind.sh
+++ b/speechx/examples/codelab/nnet/valgrind.sh
@@ -12,9 +12,10 @@ if [ ! -d ${SPEECHX_TOOLS}/valgrind/install ]; then
exit 1
fi
-model_dir=../paddle_asr_model
+ckpt_dir=./data/model
+model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
valgrind --tool=memcheck --track-origins=yes --leak-check=full --show-leak-kinds=all \
- pp-model-test \
+ ds2_model_test_main \
--model_path=$model_dir/avg_1.jit.pdmodel \
- --param_path=$model_dir/avg_1.jit.pdparams
\ No newline at end of file
+ --param_path=$model_dir/avg_1.jit.pdparams
diff --git a/speechx/examples/custom_asr/run.sh b/speechx/examples/custom_asr/run.sh
index 8d88000dcb79f07ff86eecf697d11097ec810bda..dddcf9fd1091a1f6208243f19171aa8923c8777d 100644
--- a/speechx/examples/custom_asr/run.sh
+++ b/speechx/examples/custom_asr/run.sh
@@ -7,7 +7,7 @@ export GLOG_logtostderr=1
. ./path.sh || exit 1;
# ds2 means deepspeech2 (acoutic model type)
-dir=$PWD/ds2_graph_with_slot
+dir=$PWD/exp/ds2_graph_with_slot
data=$PWD/data
stage=0
stop_stage=10
@@ -80,9 +80,9 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--word_symbol_table=$graph/words.txt \
--graph_path=$graph/TLG.fst --max_active=7500 \
--acoustic_scale=12 \
- --result_wspecifier=ark,t:./result_run.txt
+ --result_wspecifier=ark,t:./exp/result_run.txt
# the data/wav.trans is the label.
- utils/compute-wer.py --char=1 --v=1 data/wav.trans result_run.txt > wer_run
- tail -n 7 wer_run
+ utils/compute-wer.py --char=1 --v=1 data/wav.trans exp/result_run.txt > exp/wer_run
+ tail -n 7 exp/wer_run
fi
diff --git a/speechx/examples/dev/glog/CMakeLists.txt b/speechx/examples/dev/glog/CMakeLists.txt
deleted file mode 100644
index b4b0e6358b4ca7c2ac27056a34d1d0b9ed1b6399..0000000000000000000000000000000000000000
--- a/speechx/examples/dev/glog/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-
-add_executable(glog_test ${CMAKE_CURRENT_SOURCE_DIR}/glog_test.cc)
-target_link_libraries(glog_test glog)
-
-
-add_executable(glog_logtostderr_test ${CMAKE_CURRENT_SOURCE_DIR}/glog_logtostderr_test.cc)
-target_link_libraries(glog_logtostderr_test glog)
\ No newline at end of file
diff --git a/speechx/examples/dev/glog/run.sh b/speechx/examples/dev/glog/run.sh
deleted file mode 100755
index d3fcdb643902201ff1f354c15cb8d5a26b9f9fb4..0000000000000000000000000000000000000000
--- a/speechx/examples/dev/glog/run.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash
-set +x
-set -e
-
-. ./path.sh
-
-# 1. compile
-if [ ! -d ${SPEECHX_EXAMPLES} ]; then
- pushd ${SPEECHX_ROOT}
- bash build.sh
- popd
-fi
-
-# 2. run
-glog_test
-
-echo "------"
-export FLAGS_logtostderr=1
-glog_test
-
-echo "------"
-glog_logtostderr_test
diff --git a/speechx/examples/ds2_ol/CMakeLists.txt b/speechx/examples/ds2_ol/CMakeLists.txt
deleted file mode 100644
index 08c19484626597fc5a42f2afae8a5b7ddd14a55d..0000000000000000000000000000000000000000
--- a/speechx/examples/ds2_ol/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-
-add_subdirectory(feat)
-add_subdirectory(nnet)
-add_subdirectory(decoder)
-add_subdirectory(websocket)
diff --git a/speechx/examples/ds2_ol/aishell/README.md b/speechx/examples/ds2_ol/aishell/README.md
index 1ed8a67c2f682a36ab652ba74aa7e017cd2d4ebb..3e7af9244e23b591be594bf65b36e877031003f8 100644
--- a/speechx/examples/ds2_ol/aishell/README.md
+++ b/speechx/examples/ds2_ol/aishell/README.md
@@ -42,3 +42,40 @@ Overall -> 10.93 % N=104765 C=93410 S=9780 D=1575 I=95
Mandarin -> 10.93 % N=104762 C=93410 S=9779 D=1573 I=95
Other -> 100.00 % N=3 C=0 S=1 D=2 I=0
```
+
+## fbank
+```
+bash run_fbank.sh
+```
+
+### CTC Prefix Beam Search w/o LM
+
+```
+Overall -> 10.44 % N=104765 C=94194 S=10174 D=397 I=369
+Mandarin -> 10.44 % N=104762 C=94194 S=10171 D=397 I=369
+Other -> 100.00 % N=3 C=0 S=3 D=0 I=0
+```
+
+### CTC Prefix Beam Search w/ LM
+
+LM: zh_giga.no_cna_cmn.prune01244.klm
+
+```
+Overall -> 5.82 % N=104765 C=99386 S=4944 D=435 I=720
+Mandarin -> 5.82 % N=104762 C=99386 S=4941 D=435 I=720
+English -> 0.00 % N=0 C=0 S=0 D=0 I=0
+```
+
+### CTC WFST
+
+LM: [aishell train](https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph2.zip)
+```
+Overall -> 9.58 % N=104765 C=94817 S=4326 D=5622 I=84
+Mandarin -> 9.57 % N=104762 C=94817 S=4325 D=5620 I=84
+Other -> 100.00 % N=3 C=0 S=1 D=2 I=0
+```
+
+## build TLG graph
+```
+ bash run_build_tlg.sh
+```
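+
+This builds the lexicon (L), token (T) and grammar (G) FSTs and composes them into `TLG.fst` under `data/lang_test`, which is then used for WFST decoding.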
diff --git a/speechx/examples/ngram/zh/local/aishell_train_lms.sh b/speechx/examples/ds2_ol/aishell/local/aishell_train_lms.sh
similarity index 100%
rename from speechx/examples/ngram/zh/local/aishell_train_lms.sh
rename to speechx/examples/ds2_ol/aishell/local/aishell_train_lms.sh
diff --git a/speechx/examples/ds2_ol/aishell/path.sh b/speechx/examples/ds2_ol/aishell/path.sh
index 520129eafcfc469c09022db359345488034325bf..69c78e7463bbf6773882473dcb1262b8861afec3 100755
--- a/speechx/examples/ds2_ol/aishell/path.sh
+++ b/speechx/examples/ds2_ol/aishell/path.sh
@@ -1,14 +1,24 @@
# This contains the locations of binarys build required for running the examples.
-SPEECHX_ROOT=$PWD/../../..
-SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
+MAIN_ROOT=`realpath $PWD/../../../../`
+SPEECHX_ROOT=$PWD/../../../
+SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx
SPEECHX_TOOLS=$SPEECHX_ROOT/tools
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
-[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
+[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. Please ensure that the project was built successfully"; }
export LC_AL=C
-SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/decoder:$SPEECHX_EXAMPLES/ds2_ol/feat:$SPEECHX_EXAMPLES/ds2_ol/websocket
-export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
+# openfst bin & kaldi bin
+KALDI_DIR=$SPEECHX_ROOT/build/speechx/kaldi/
+OPENFST_DIR=$SPEECHX_ROOT/fc_patch/openfst-build/src
+
+# srilm
+export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
+export SRILM=${MAIN_ROOT}/tools/srilm
+
+SPEECHX_BIN=$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio:$SPEECHX_BUILD/websocket
+export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN:${SRILM}/bin:${SRILM}/bin/i686-m64:$KALDI_DIR/lmbin:$KALDI_DIR/fstbin:$OPENFST_DIR/bin
diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh
index 650cb14090eea9b33e11d24c6d1951033f3e4c78..e1001e250dd13f3b4b5275b285b908199b612c57 100755
--- a/speechx/examples/ds2_ol/aishell/run.sh
+++ b/speechx/examples/ds2_ol/aishell/run.sh
@@ -69,12 +69,12 @@ export GLOG_logtostderr=1
cmvn=$data/cmvn.ark
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# 3. gen linear feat
- cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
+ cmvn_json2kaldi_main --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/feat.log \
- linear-spectrogram-wo-db-norm-ol \
+ compute_linear_spectrogram_main \
--wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
--feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \
--cmvn_file=$cmvn \
@@ -85,7 +85,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# recognizer
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \
- ctc-prefix-beam-search-decoder-ol \
+ ctc_prefix_beam_search_decoder_main \
--feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
--model_path=$model_dir/avg_1.jit.pdmodel \
--param_path=$model_dir/avg_1.jit.pdiparams \
@@ -102,7 +102,7 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# decode with lm
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \
- ctc-prefix-beam-search-decoder-ol \
+ ctc_prefix_beam_search_decoder_main \
--feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
--model_path=$model_dir/avg_1.jit.pdmodel \
--param_path=$model_dir/avg_1.jit.pdiparams \
@@ -132,7 +132,7 @@ fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# TLG decoder
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \
- wfst-decoder-ol \
+ tlg_decoder_main \
--feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
--model_path=$model_dir/avg_1.jit.pdmodel \
--param_path=$model_dir/avg_1.jit.pdiparams \
@@ -151,7 +151,7 @@ fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# TLG decoder
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recognizer.log \
- recognizer_test_main \
+ recognizer_main \
--wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
--cmvn_file=$cmvn \
--model_path=$model_dir/avg_1.jit.pdmodel \
diff --git a/speechx/examples/ds2_ol/aishell/run_build_tlg.sh b/speechx/examples/ds2_ol/aishell/run_build_tlg.sh
new file mode 100755
index 0000000000000000000000000000000000000000..2e148657bee1ba2e8067b2b31739b447ca619a49
--- /dev/null
+++ b/speechx/examples/ds2_ol/aishell/run_build_tlg.sh
@@ -0,0 +1,141 @@
+#!/bin/bash
+set -eo pipefail
+
+. path.sh
+
+# Attention: please replace the vocab as needed; the one below is only for this script.
+# Different acoustic models have different vocabs.
+ckpt_dir=data/fbank_model
+unit=$ckpt_dir/data/lang_char/vocab.txt # vocab file, line: char/spm_piece
+model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
+
+stage=-1
+stop_stage=100
+corpus=aishell
+lexicon=data/lexicon.txt # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
+text=data/text # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
+
+. utils/parse_options.sh
+
+data=$PWD/data
+mkdir -p $data
+
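+# stage -1: download the ngram text corpus and the pretrained deepspeech2 online (fbank) model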
+if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
+ if [ ! -f $data/speech.ngram.zh.tar.gz ];then
+ pushd $data
+ wget -c http://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/speech.ngram.zh.tar.gz
+ tar xvzf speech.ngram.zh.tar.gz
+ popd
+ fi
+
+ if [ ! -f $ckpt_dir/data/mean_std.json ]; then
+ mkdir -p $ckpt_dir
+ pushd $ckpt_dir
+ wget -c https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/WIP1_asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz
+ tar xzfv WIP1_asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz
+ popd
+ fi
+fi
+
+if [ ! -f $unit ]; then
+ echo "$0: No such file $unit"
+ exit 1;
+fi
+
+if ! which ngram-count; then
+ pushd $MAIN_ROOT/tools
+ make srilm.done
+ popd
+fi
+
+mkdir -p data/local/dict
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # Prepare dict
+ # line: char/spm_pieces
+ cp $unit data/local/dict/units.txt
+
+ if [ ! -f $lexicon ];then
+ utils/text_to_lexicon.py --has_key true --text $text --lexicon $lexicon
+ echo "Generate $lexicon from $text"
+ fi
+
+ # filter by vocab
+ # line: word ph0 ... phn -> line: word char0 ... charn
+ utils/fst/prepare_dict.py \
+ --unit_file $unit \
+ --in_lexicon ${lexicon} \
+ --out_lexicon data/local/dict/lexicon.txt
+fi
+
+lm=data/local/lm
+mkdir -p $lm
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # Train lm
+ cp $text $lm/text
+ local/aishell_train_lms.sh
+ echo "build LM done."
+fi
+
+# build TLG
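+# T: token FST, L: lexicon FST, G: grammar (LM) FST; they are composed into the TLG decoding graph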
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # build T & L
+ utils/fst/compile_lexicon_token_fst.sh \
+ data/local/dict data/local/tmp data/local/lang
+
+ # build G & TLG
+ utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
+
+fi
+
+aishell_wav_scp=aishell_test.scp
+nj=40
+cmvn=$data/cmvn_fbank.ark
+wfst=$data/lang_test
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+
+ if [ ! -d $data/test ]; then
+ pushd $data
+ wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
+ unzip aishell_test.zip
+ popd
+
+ realpath $data/test/*/*.wav > $data/wavlist
+ awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id
+ paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
+ fi
+
+ ./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
+
+    cmvn_json2kaldi_main --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
+fi
+
+exp=$PWD/exp
+mkdir -p $exp
+wer=aishell_wer
+label_file=aishell_result
+export GLOG_logtostderr=1
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ # TLG decoder
+ utils/run.pl JOB=1:$nj $data/split${nj}/JOB/check_tlg.log \
+ recognizer_main \
+ --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
+ --cmvn_file=$cmvn \
+ --model_path=$model_dir/avg_5.jit.pdmodel \
+ --streaming_chunk=30 \
+ --use_fbank=true \
+ --param_path=$model_dir/avg_5.jit.pdiparams \
+ --word_symbol_table=$wfst/words.txt \
+ --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
+ --model_cache_shapes="5-1-2048,5-1-2048" \
+ --graph_path=$wfst/TLG.fst --max_active=7500 \
+ --acoustic_scale=1.2 \
+ --result_wspecifier=ark,t:$data/split${nj}/JOB/result_check_tlg
+
+ cat $data/split${nj}/*/result_check_tlg > $exp/${label_file}_check_tlg
+ utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_check_tlg > $exp/${wer}.check_tlg
+ echo "recognizer test have finished!!!"
+ echo "please checkout in ${exp}/${wer}.check_tlg"
+fi
+
+exit 0
diff --git a/speechx/examples/ds2_ol/aishell/run_fbank.sh b/speechx/examples/ds2_ol/aishell/run_fbank.sh
index 3d4825ace078d2720fee9b6b9b73cac5ab71492a..6e1316774ac9352d22eda2eb40026802d8c568ae 100755
--- a/speechx/examples/ds2_ol/aishell/run_fbank.sh
+++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh
@@ -69,7 +69,7 @@ export GLOG_logtostderr=1
cmvn=$data/cmvn_fbank.ark
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# 3. gen linear feat
- cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn --binary=false
+ cmvn_json2kaldi_main --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn --binary=false
./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
@@ -84,7 +84,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# recognizer
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.wolm.log \
- ctc-prefix-beam-search-decoder-ol \
+ ctc_prefix_beam_search_decoder_main \
--feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \
--model_path=$model_dir/avg_5.jit.pdmodel \
--param_path=$model_dir/avg_5.jit.pdiparams \
@@ -100,12 +100,12 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# decode with lm
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.lm.log \
- ctc-prefix-beam-search-decoder-ol \
+ ctc_prefix_beam_search_decoder_main \
--feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \
--model_path=$model_dir/avg_5.jit.pdmodel \
--param_path=$model_dir/avg_5.jit.pdiparams \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
- --model_cache_shapes="5-1-2048,5-1-2048" \
+ --model_cache_shapes="5-1-2048,5-1-2048" \
--dict_file=$vocb_dir/vocab.txt \
--lm_path=$lm \
--result_wspecifier=ark,t:$data/split${nj}/JOB/fbank_result_lm
@@ -129,13 +129,13 @@ fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# TLG decoder
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.wfst.log \
- wfst-decoder-ol \
+ tlg_decoder_main \
--feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \
--model_path=$model_dir/avg_5.jit.pdmodel \
--param_path=$model_dir/avg_5.jit.pdiparams \
--word_symbol_table=$wfst/words.txt \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
- --model_cache_shapes="5-1-2048,5-1-2048" \
+ --model_cache_shapes="5-1-2048,5-1-2048" \
--graph_path=$wfst/TLG.fst --max_active=7500 \
--acoustic_scale=1.2 \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg
@@ -148,13 +148,12 @@ fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/fbank_recognizer.log \
- recognizer_test_main \
+ recognizer_main \
--wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
--cmvn_file=$cmvn \
--model_path=$model_dir/avg_5.jit.pdmodel \
--streaming_chunk=30 \
--use_fbank=true \
- --to_float32=false \
--param_path=$model_dir/avg_5.jit.pdiparams \
--word_symbol_table=$wfst/words.txt \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
diff --git a/speechx/examples/ds2_ol/decoder/CMakeLists.txt b/speechx/examples/ds2_ol/decoder/CMakeLists.txt
deleted file mode 100644
index 62dd6862e267e7e3e5a737657c2086e113c85919..0000000000000000000000000000000000000000
--- a/speechx/examples/ds2_ol/decoder/CMakeLists.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-
-set(bin_name ctc-prefix-beam-search-decoder-ol)
-add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
-target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS})
-
-
-set(bin_name wfst-decoder-ol)
-add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
-target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util kaldi-decoder ${DEPS})
-
-
-set(bin_name nnet-logprob-decoder-test)
-add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
-target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS})
-
-add_executable(recognizer_test_main ${CMAKE_CURRENT_SOURCE_DIR}/recognizer_test_main.cc)
-target_include_directories(recognizer_test_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(recognizer_test_main PUBLIC frontend kaldi-feat-common nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util kaldi-decoder ${DEPS})
diff --git a/speechx/examples/ds2_ol/decoder/local/model.sh b/speechx/examples/ds2_ol/decoder/local/model.sh
deleted file mode 100644
index 5c609a6cf45715106f745dbc2755775734e35558..0000000000000000000000000000000000000000
--- a/speechx/examples/ds2_ol/decoder/local/model.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash
-
-
diff --git a/speechx/examples/ds2_ol/decoder/path.sh b/speechx/examples/ds2_ol/decoder/path.sh
deleted file mode 100644
index 8e26e6e7eef1421827c92bda7b4b5679677de8c9..0000000000000000000000000000000000000000
--- a/speechx/examples/ds2_ol/decoder/path.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-# This contains the locations of binarys build required for running the examples.
-
-SPEECHX_ROOT=$PWD/../../../
-SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
-
-SPEECHX_TOOLS=$SPEECHX_ROOT/tools
-TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
-
-[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
-
-export LC_AL=C
-
-SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/decoder:$SPEECHX_EXAMPLES/ds2_ol/feat
-export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
diff --git a/speechx/examples/ds2_ol/feat/.gitignore b/speechx/examples/ds2_ol/feat/.gitignore
deleted file mode 100644
index 566f2d97bf19b5e2e3dfe7c71d2905f40bec2781..0000000000000000000000000000000000000000
--- a/speechx/examples/ds2_ol/feat/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-exp
-data
diff --git a/speechx/examples/ds2_ol/feat/CMakeLists.txt b/speechx/examples/ds2_ol/feat/CMakeLists.txt
deleted file mode 100644
index 632f22e85897f055235735f53c7b8f0a993e6e5b..0000000000000000000000000000000000000000
--- a/speechx/examples/ds2_ol/feat/CMakeLists.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-
-set(bin_name linear-spectrogram-wo-db-norm-ol)
-add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
-target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(${bin_name} frontend kaldi-util kaldi-feat-common gflags glog)
-
-set(bin_name compute_fbank_main)
-add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
-target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(${bin_name} frontend kaldi-util kaldi-feat-common gflags glog)
-
-set(bin_name cmvn-json2kaldi)
-add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
-target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog)
diff --git a/speechx/examples/ds2_ol/nnet/path.sh b/speechx/examples/ds2_ol/nnet/path.sh
deleted file mode 100644
index 0ee8b4787ebd43dc26f1e3e6ca6e470a5e5036a5..0000000000000000000000000000000000000000
--- a/speechx/examples/ds2_ol/nnet/path.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-# This contains the locations of binarys build required for running the examples.
-
-SPEECHX_ROOT=$PWD/../../../
-SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
-
-SPEECHX_TOOLS=$SPEECHX_ROOT/tools
-TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
-
-[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
-
-export LC_AL=C
-
-SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/nnet
-export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
diff --git a/speechx/examples/ds2_ol/websocket/CMakeLists.txt b/speechx/examples/ds2_ol/websocket/CMakeLists.txt
deleted file mode 100644
index ed542aad07b729f5c0797a10a5733cbb7c1bc7f6..0000000000000000000000000000000000000000
--- a/speechx/examples/ds2_ol/websocket/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-
-add_executable(websocket_server_main ${CMAKE_CURRENT_SOURCE_DIR}/websocket_server_main.cc)
-target_include_directories(websocket_server_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(websocket_server_main PUBLIC frontend kaldi-feat-common nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util kaldi-decoder websocket ${DEPS})
-
-add_executable(websocket_client_main ${CMAKE_CURRENT_SOURCE_DIR}/websocket_client_main.cc)
-target_include_directories(websocket_client_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(websocket_client_main PUBLIC frontend kaldi-feat-common nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util kaldi-decoder websocket ${DEPS})
\ No newline at end of file
diff --git a/speechx/examples/ds2_ol/websocket/path.sh b/speechx/examples/ds2_ol/websocket/path.sh
index d66b5dccea6fd44a9180020f6b557bfcf89c7875..d25e88a2764b1d1cef43675d86f8907248ad2018 100755
--- a/speechx/examples/ds2_ol/websocket/path.sh
+++ b/speechx/examples/ds2_ol/websocket/path.sh
@@ -1,14 +1,14 @@
# This contains the locations of binarys build required for running the examples.
-SPEECHX_ROOT=$PWD/../../..
-SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
+SPEECHX_ROOT=$PWD/../../../
+SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx
SPEECHX_TOOLS=$SPEECHX_ROOT/tools
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
-[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
+[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. please ensure that the project was built successfully"; }
export LC_AL=C
-SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/websocket:$SPEECHX_EXAMPLES/ds2_ol/feat
+SPEECHX_BIN=$SPEECHX_BUILD/protocol/websocket
export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
diff --git a/speechx/examples/ds2_ol/websocket/websocket_server.sh b/speechx/examples/ds2_ol/websocket/websocket_server.sh
index fc57e326fb8cc2491d2443738fb8552e052fd033..f798dfd41ac8c6f83fcf1d7847237caac341a463 100755
--- a/speechx/examples/ds2_ol/websocket/websocket_server.sh
+++ b/speechx/examples/ds2_ol/websocket/websocket_server.sh
@@ -45,7 +45,7 @@ export GLOG_logtostderr=1
# 3. gen cmvn
cmvn=$data/cmvn.ark
-cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
+cmvn_json2kaldi_main --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
wfst=$data/wfst/
diff --git a/speechx/examples/ngram/.gitignore b/speechx/examples/ngram/.gitignore
deleted file mode 100644
index bbd86a25b018bc611bf6ae52cbb6afa5f60bce62..0000000000000000000000000000000000000000
--- a/speechx/examples/ngram/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-data
-exp
diff --git a/speechx/examples/ngram/en/README.md b/speechx/examples/ngram/en/README.md
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/speechx/examples/ngram/zh/README.md b/speechx/examples/ngram/zh/README.md
deleted file mode 100644
index e11bd3439618e0d2dc92b1b30453a8a4eb5f976c..0000000000000000000000000000000000000000
--- a/speechx/examples/ngram/zh/README.md
+++ /dev/null
@@ -1,101 +0,0 @@
-# ngram train for mandarin
-
-Quick run:
-```
-bash run.sh --stage -1
-```
-
-## input
-
-input files:
-```
-data/
-├── lexicon.txt
-├── text
-└── vocab.txt
-```
-
-```
-==> data/text <==
-BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
-BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
-BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
-BAC009S0002W0125 各地 政府 便 纷纷 跟进
-BAC009S0002W0126 仅 一 个 多 月 的 时间 里
-BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
-BAC009S0002W0128 四十六 个 限 购 城市 当中
-BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
-BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
-BAC009S0002W0131 显示 出 了 极 强 的 威力
-
-==> data/lexicon.txt <==
-SIL sil
- sil
-啊 aa a1
-啊 aa a2
-啊 aa a4
-啊 aa a5
-啊啊啊 aa a2 aa a2 aa a2
-啊啊啊 aa a5 aa a5 aa a5
-坐地 z uo4 d i4
-坐实 z uo4 sh ix2
-坐视 z uo4 sh ix4
-坐稳 z uo4 uu un3
-坐拥 z uo4 ii iong1
-坐诊 z uo4 zh en3
-坐庄 z uo4 zh uang1
-坐姿 z uo4 z iy1
-
-==> data/vocab.txt <==
-
-
-A
-B
-C
-D
-E
-龙
-龚
-龛
-
-```
-
-## output
-
-```
-data/
-├── local
-│ ├── dict
-│ │ ├── lexicon.txt
-│ │ └── units.txt
-│ └── lm
-│ ├── heldout
-│ ├── lm.arpa
-│ ├── text
-│ ├── text.no_oov
-│ ├── train
-│ ├── unigram.counts
-│ ├── word.counts
-│ └── wordlist
-```
-
-```
-/workspace/srilm/bin/i686-m64/ngram-count
-Namespace(bpemodel=None, in_lexicon='data/lexicon.txt', out_lexicon='data/local/dict/lexicon.txt', unit_file='data/vocab.txt')
-Ignoring words 矽, which contains oov unit
-Ignoring words 傩, which contains oov unit
-Ignoring words 堀, which contains oov unit
-Ignoring words 莼, which contains oov unit
-Ignoring words 菰, which contains oov unit
-Ignoring words 摭, which contains oov unit
-Ignoring words 帙, which contains oov unit
-Ignoring words 迨, which contains oov unit
-Ignoring words 孥, which contains oov unit
-Ignoring words 瑗, which contains oov unit
-...
-...
-...
-file data/local/lm/heldout: 10000 sentences, 89496 words, 0 OOVs
-0 zeroprobs, logprob= -270337.9 ppl= 521.2819 ppl1= 1048.745
-build LM done.
-```
diff --git a/speechx/examples/ngram/zh/local/split_data.sh b/speechx/examples/ngram/zh/local/split_data.sh
deleted file mode 100755
index 2af6fc5abdf632b00c6564987c345bd8d9594904..0000000000000000000000000000000000000000
--- a/speechx/examples/ngram/zh/local/split_data.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env bash
-
-set -eo pipefail
-
-data=$1
-scp=$2
-split_name=$3
-numsplit=$4
-
-# save in $data/split{n}
-# $scp to split
-#
-
-if [[ ! $numsplit -gt 0 ]]; then
- echo "Invalid num-split argument";
- exit 1;
-fi
-
-directories=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n; done)
-scp_splits=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/${split_name}; done)
-
-# if this mkdir fails due to argument-list being too long, iterate.
-if ! mkdir -p $directories >&/dev/null; then
- for n in `seq $numsplit`; do
- mkdir -p $data/split${numsplit}/$n
- done
-fi
-
-echo "utils/split_scp.pl $scp $scp_splits"
-utils/split_scp.pl $scp $scp_splits
diff --git a/speechx/examples/ngram/zh/path.sh b/speechx/examples/ngram/zh/path.sh
deleted file mode 100644
index a3fb3d75878a3b7a641d7f13e464aa324a9b0ea6..0000000000000000000000000000000000000000
--- a/speechx/examples/ngram/zh/path.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-# This contains the locations of binarys build required for running the examples.
-
-MAIN_ROOT=`realpath $PWD/../../../../`
-SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
-
-export LC_AL=C
-
-# srilm
-export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
-export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
-export SRILM=${MAIN_ROOT}/tools/srilm
-export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
diff --git a/speechx/examples/ngram/zh/run.sh b/speechx/examples/ngram/zh/run.sh
deleted file mode 100755
index f24ad0a7cc3d605d7a3af83d4d519f208abc3acf..0000000000000000000000000000000000000000
--- a/speechx/examples/ngram/zh/run.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/bin/bash
-set -eo pipefail
-
-. path.sh
-
-stage=-1
-stop_stage=100
-corpus=aishell
-
-unit=data/vocab.txt # vocab file, line: char/spm_pice
-lexicon=data/lexicon.txt # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
-text=data/text # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
-
-. utils/parse_options.sh
-
-data=$PWD/data
-mkdir -p $data
-
-if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
- if [ ! -f $data/speech.ngram.zh.tar.gz ];then
- pushd $data
- wget -c http://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/speech.ngram.zh.tar.gz
- tar xvzf speech.ngram.zh.tar.gz
- popd
- fi
-fi
-
-if [ ! -f $unit ]; then
- echo "$0: No such file $unit"
- exit 1;
-fi
-
-if ! which ngram-count; then
- pushd $MAIN_ROOT/tools
- make srilm.done
- popd
-fi
-
-mkdir -p data/local/dict
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
- # 7.1 Prepare dict
- # line: char/spm_pices
- cp $unit data/local/dict/units.txt
-
- if [ ! -f $lexicon ];then
- local/text_to_lexicon.py --has_key true --text $text --lexicon $lexicon
- echo "Generate $lexicon from $text"
- fi
-
- # filter by vocab
- # line: word ph0 ... phn -> line: word char0 ... charn
- utils/fst/prepare_dict.py \
- --unit_file $unit \
- --in_lexicon ${lexicon} \
- --out_lexicon data/local/dict/lexicon.txt
-fi
-
-lm=data/local/lm
-mkdir -p $lm
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
- # 7.2 Train lm
- cp $text $lm/text
- local/aishell_train_lms.sh
-fi
-
-echo "build LM done."
-exit 0
diff --git a/speechx/examples/ngram/zh/utils b/speechx/examples/ngram/zh/utils
deleted file mode 120000
index c2519a9dd0bc11c4ca3de4bf89e16634dadcd4c9..0000000000000000000000000000000000000000
--- a/speechx/examples/ngram/zh/utils
+++ /dev/null
@@ -1 +0,0 @@
-../../../../utils/
\ No newline at end of file
diff --git a/speechx/examples/wfst/.gitignore b/speechx/examples/wfst/.gitignore
deleted file mode 100644
index 1269488f7fb1f4b56a8c0e5eb48cecbfadfa9219..0000000000000000000000000000000000000000
--- a/speechx/examples/wfst/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-data
diff --git a/speechx/examples/wfst/README.md b/speechx/examples/wfst/README.md
deleted file mode 100644
index d0bdac0fc8e69be88d963b12d8834d3c1d6a588e..0000000000000000000000000000000000000000
--- a/speechx/examples/wfst/README.md
+++ /dev/null
@@ -1,186 +0,0 @@
-# Built TLG wfst
-
-## Input
-```
-data/local/
-├── dict
-│ ├── lexicon.txt
-│ └── units.txt
-└── lm
- ├── heldout
- ├── lm.arpa
- ├── text
- ├── text.no_oov
- ├── train
- ├── unigram.counts
- ├── word.counts
- └── wordlist
-```
-
-```
-==> data/local/dict/lexicon.txt <==
-啊 啊
-啊啊啊 啊 啊 啊
-阿 阿
-阿尔 阿 尔
-阿根廷 阿 根 廷
-阿九 阿 九
-阿克 阿 克
-阿拉伯数字 阿 拉 伯 数 字
-阿拉法特 阿 拉 法 特
-阿拉木图 阿 拉 木 图
-
-==> data/local/dict/units.txt <==
-
-
-A
-B
-C
-D
-E
-F
-G
-H
-
-==> data/local/lm/heldout <==
-而 对 楼市 成交 抑制 作用 最 大 的 限 购
-也 成为 地方 政府 的 眼中 钉
-自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
-各地 政府 便 纷纷 跟进
-仅 一 个 多 月 的 时间 里
-除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
-四十六 个 限 购 城市 当中
-四十一 个 已 正式 取消 或 变相 放松 了 限 购
-财政 金融 政策 紧随 其后 而来
-显示 出 了 极 强 的 威力
-
-==> data/local/lm/lm.arpa <==
-
-\data\
-ngram 1=129356
-ngram 2=504661
-ngram 3=123455
-
-\1-grams:
--1.531278
--3.828829 -0.1600094
--6.157292
-
-==> data/local/lm/text <==
-BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
-BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
-BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
-BAC009S0002W0125 各地 政府 便 纷纷 跟进
-BAC009S0002W0126 仅 一 个 多 月 的 时间 里
-BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
-BAC009S0002W0128 四十六 个 限 购 城市 当中
-BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
-BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
-BAC009S0002W0131 显示 出 了 极 强 的 威力
-
-==> data/local/lm/text.no_oov <==
- 而 对 楼市 成交 抑制 作用 最 大 的 限 购
- 也 成为 地方 政府 的 眼中 钉
- 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
- 各地 政府 便 纷纷 跟进
- 仅 一 个 多 月 的 时间 里
- 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
- 四十六 个 限 购 城市 当中
- 四十一 个 已 正式 取消 或 变相 放松 了 限 购
- 财政 金融 政策 紧随 其后 而来
- 显示 出 了 极 强 的 威力
-
-==> data/local/lm/train <==
-汉莎 不 得 不 通过 这样 的 方式 寻求 新 的 发展 点
-并 计划 朝云 计算 方面 发展
-汉莎 的 基础 设施 部门 拥有 一千四百 名 员工
-媒体 就 曾 披露 这笔 交易
-虽然 双方 已经 正式 签署 了 外包 协议
-但是 这笔 交易 还 需要 得到 反 垄断 部门 的 批准
-陈 黎明 一九八九 年 获得 美国 康乃尔 大学 硕士 学位
-并 于 二零零三 年 顺利 完成 美国 哈佛 商学 院 高级 管理 课程
-曾 在 多家 国际 公司 任职
-拥有 业务 开发 商务 及 企业 治理
-
-==> data/local/lm/unigram.counts <==
- 57487 的
- 13099 在
- 11862 一
- 11397 了
- 10998 不
- 9913 是
- 7952 有
- 6250 和
- 6152 个
- 5422 将
-
-==> data/local/lm/word.counts <==
- 57486 的
- 13098 在
- 11861 一
- 11396 了
- 10997 不
- 9912 是
- 7951 有
- 6249 和
- 6151 个
- 5421 将
-
-==> data/local/lm/wordlist <==
-的
-在
-一
-了
-不
-是
-有
-和
-个
-将
-```
-
-## Output
-
-```
-fstaddselfloops 'echo 4234 |' 'echo 123660 |'
-Lexicon and Token FSTs compiling succeeded
-arpa2fst --read-symbol-table=data/lang_test/words.txt --keep-symbols=true -
-LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:94) Reading \data\ section.
-LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \1-grams: section.
-LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \2-grams: section.
-LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \3-grams: section.
-Checking how stochastic G is (the first of these numbers should be small):
-fstisstochastic data/lang_test/G.fst
-0 -1.14386
-fsttablecompose data/lang_test/L.fst data/lang_test/G.fst
-fstminimizeencoded
-fstdeterminizestar --use-log=true
-fsttablecompose data/lang_test/T.fst data/lang_test/LG.fst
-Composing decoding graph TLG.fst succeeded
-Aishell build TLG done.
-```
-
-```
-data/
-├── lang_test
-│ ├── G.fst
-│ ├── L.fst
-│ ├── LG.fst
-│ ├── T.fst
-│ ├── TLG.fst
-│ ├── tokens.txt
-│ ├── units.txt
-│ └── words.txt
-└── local
- ├── lang
- │ ├── L.fst
- │ ├── T.fst
- │ ├── tokens.txt
- │ ├── units.txt
- │ └── words.txt
- └── tmp
- ├── disambig.list
- ├── lexiconp_disambig.txt
- ├── lexiconp.txt
- └── units.list
-```
diff --git a/speechx/examples/wfst/path.sh b/speechx/examples/wfst/path.sh
deleted file mode 100644
index a07c1297de8c54daf5f78d6b5deb0d87de56988f..0000000000000000000000000000000000000000
--- a/speechx/examples/wfst/path.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-# This contains the locations of binarys build required for running the examples.
-
-MAIN_ROOT=`realpath $PWD/../../../`
-SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
-
-export LC_AL=C
-
-# srilm
-export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
-export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
-export SRILM=${MAIN_ROOT}/tools/srilm
-export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
-
-# Kaldi
-export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
-[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
-export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
-[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, can not using Kaldi!"
-[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh
diff --git a/speechx/examples/wfst/run.sh b/speechx/examples/wfst/run.sh
deleted file mode 100755
index 1354646af1050fe34f08883ef8abc82038c66e7e..0000000000000000000000000000000000000000
--- a/speechx/examples/wfst/run.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-set -eo pipefail
-
-. path.sh
-
-stage=-1
-stop_stage=100
-
-. utils/parse_options.sh
-
-if ! which fstprint ; then
- pushd $MAIN_ROOT/tools
- make kaldi.done
- popd
-fi
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
- # build T & L
- # utils/fst/compile_lexicon_token_fst.sh
- utils/fst/compile_lexicon_token_fst.sh \
- data/local/dict data/local/tmp data/local/lang
-
- # build G & LG & TLG
- # utils/fst/make_tlg.sh
- utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
-fi
-
-echo "build TLG done."
-exit 0
diff --git a/speechx/examples/wfst/utils b/speechx/examples/wfst/utils
deleted file mode 120000
index 256f914abcaa47d966c44878b88a300437f110fb..0000000000000000000000000000000000000000
--- a/speechx/examples/wfst/utils
+++ /dev/null
@@ -1 +0,0 @@
-../../../utils/
\ No newline at end of file
diff --git a/speechx/speechx/CMakeLists.txt b/speechx/speechx/CMakeLists.txt
index b4da095d8b071604eb9b173b9a87b03f101d60f7..c8e21d4867d615b6005be11e4175e6f6e24aaed1 100644
--- a/speechx/speechx/CMakeLists.txt
+++ b/speechx/speechx/CMakeLists.txt
@@ -34,6 +34,12 @@ add_subdirectory(decoder)
include_directories(
${CMAKE_CURRENT_SOURCE_DIR}
-${CMAKE_CURRENT_SOURCE_DIR}/websocket
+${CMAKE_CURRENT_SOURCE_DIR}/protocol
)
-add_subdirectory(websocket)
+add_subdirectory(protocol)
+
+include_directories(
+${CMAKE_CURRENT_SOURCE_DIR}
+${CMAKE_CURRENT_SOURCE_DIR}/codelab
+)
+add_subdirectory(codelab)
diff --git a/speechx/examples/dev/CMakeLists.txt b/speechx/speechx/codelab/CMakeLists.txt
similarity index 76%
rename from speechx/examples/dev/CMakeLists.txt
rename to speechx/speechx/codelab/CMakeLists.txt
index c8445fb821f170cd998342f480508b1e33911254..95043263713a7b3734b16ba2f733bd399673a820 100644
--- a/speechx/examples/dev/CMakeLists.txt
+++ b/speechx/speechx/codelab/CMakeLists.txt
@@ -1,3 +1,4 @@
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
add_subdirectory(glog)
+add_subdirectory(nnet)
diff --git a/speechx/speechx/codelab/README.md b/speechx/speechx/codelab/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..aee60de67e7090d9b1e5eb8ac8a743cff33bfa2e
--- /dev/null
+++ b/speechx/speechx/codelab/README.md
@@ -0,0 +1,7 @@
+
+## For Developers
+
+> Reminder: for developers only.
+
+* codelab - for speechx developers, used for testing.
+
diff --git a/speechx/speechx/codelab/glog/CMakeLists.txt b/speechx/speechx/codelab/glog/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..08a98641f61f54a4a7cc04ebfc92f55feb6000be
--- /dev/null
+++ b/speechx/speechx/codelab/glog/CMakeLists.txt
@@ -0,0 +1,8 @@
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+add_executable(glog_main ${CMAKE_CURRENT_SOURCE_DIR}/glog_main.cc)
+target_link_libraries(glog_main glog)
+
+
+add_executable(glog_logtostderr_main ${CMAKE_CURRENT_SOURCE_DIR}/glog_logtostderr_main.cc)
+target_link_libraries(glog_logtostderr_main glog)
diff --git a/speechx/examples/dev/glog/README.md b/speechx/speechx/codelab/glog/README.md
similarity index 92%
rename from speechx/examples/dev/glog/README.md
rename to speechx/speechx/codelab/glog/README.md
index 996e192e9abaf6733746b6ea06561da6be627f91..3282c920dcbb83682ead047f49141da83f65c32a 100644
--- a/speechx/examples/dev/glog/README.md
+++ b/speechx/speechx/codelab/glog/README.md
@@ -23,3 +23,16 @@ You can also modify flag values in your program by modifying global variables `F
FLAGS_log_dir = "/some/log/directory";
LOG(INFO) << "the same file";
```
+
+* This is the test script:
+```
+# run
+glog_main
+
+echo "------"
+export FLAGS_logtostderr=1
+glog_main
+
+echo "------"
+glog_logtostderr_main
+```
diff --git a/speechx/examples/dev/glog/glog_logtostderr_test.cc b/speechx/speechx/codelab/glog/glog_logtostderr_main.cc
similarity index 100%
rename from speechx/examples/dev/glog/glog_logtostderr_test.cc
rename to speechx/speechx/codelab/glog/glog_logtostderr_main.cc
diff --git a/speechx/examples/dev/glog/glog_test.cc b/speechx/speechx/codelab/glog/glog_main.cc
similarity index 100%
rename from speechx/examples/dev/glog/glog_test.cc
rename to speechx/speechx/codelab/glog/glog_main.cc
diff --git a/speechx/examples/ds2_ol/nnet/CMakeLists.txt b/speechx/speechx/codelab/nnet/CMakeLists.txt
similarity index 87%
rename from speechx/examples/ds2_ol/nnet/CMakeLists.txt
rename to speechx/speechx/codelab/nnet/CMakeLists.txt
index 6745a51ae97fe227973baeaf9e4805490bbb4610..dcad8a9c65ed414d2d7ab6a3a4761335db88569e 100644
--- a/speechx/examples/ds2_ol/nnet/CMakeLists.txt
+++ b/speechx/speechx/codelab/nnet/CMakeLists.txt
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-set(bin_name ds2-model-ol-test)
+set(bin_name ds2_model_test_main)
add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(${bin_name} PUBLIC nnet gflags glog ${DEPS})
\ No newline at end of file
+target_link_libraries(${bin_name} PUBLIC nnet gflags glog ${DEPS})
diff --git a/speechx/examples/ds2_ol/nnet/ds2-model-ol-test.cc b/speechx/speechx/codelab/nnet/ds2_model_test_main.cc
similarity index 100%
rename from speechx/examples/ds2_ol/nnet/ds2-model-ol-test.cc
rename to speechx/speechx/codelab/nnet/ds2_model_test_main.cc
diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt
index 06bf4020ff17c412aeda28eb09da50d479669184..1df935112e641abe6deaafb8f57b2cb635075c00 100644
--- a/speechx/speechx/decoder/CMakeLists.txt
+++ b/speechx/speechx/decoder/CMakeLists.txt
@@ -10,3 +10,16 @@ add_library(decoder STATIC
recognizer.cc
)
target_link_libraries(decoder PUBLIC kenlm utils fst frontend nnet kaldi-decoder)
+
+set(BINS
+ ctc_prefix_beam_search_decoder_main
+ nnet_logprob_decoder_main
+ recognizer_main
+ tlg_decoder_main
+)
+
+foreach(bin_name IN LISTS BINS)
+ add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
+ target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+ target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS})
+endforeach()
diff --git a/speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc
similarity index 100%
rename from speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc
rename to speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc
diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.cc b/speechx/speechx/decoder/ctc_tlg_decoder.cc
index 02e6431658a453b70c23435fdb22b4ac9fc034d5..3f8bdd5a7e18cefa3a4b1956b85546eca5ec9f18 100644
--- a/speechx/speechx/decoder/ctc_tlg_decoder.cc
+++ b/speechx/speechx/decoder/ctc_tlg_decoder.cc
@@ -47,6 +47,26 @@ void TLGDecoder::Reset() {
return;
}
+std::string TLGDecoder::GetPartialResult() {
+ if (frame_decoded_size_ == 0) {
+ // Assertion failed: (this->NumFramesDecoded() > 0 && "You cannot call
+ // BestPathEnd if no frames were decoded.")
+ return std::string("");
+ }
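+    // Best path over the frames decoded so far (final probabilities are not
+    // used, see GetBestPath(&lat, false)), so the caller can stream partial
+    // results while decoding continues.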
+ kaldi::Lattice lat;
+ kaldi::LatticeWeight weight;
+    std::vector<int32> alignment;
+    std::vector<int32> words_id;
+ decoder_->GetBestPath(&lat, false);
+ fst::GetLinearSymbolSequence(lat, &alignment, &words_id, &weight);
+ std::string words;
+ for (int32 idx = 0; idx < words_id.size(); ++idx) {
+ std::string word = word_symbol_table_->Find(words_id[idx]);
+ words += word;
+ }
+ return words;
+}
+
std::string TLGDecoder::GetFinalBestPath() {
if (frame_decoded_size_ == 0) {
// Assertion failed: (this->NumFramesDecoded() > 0 && "You cannot call
diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.h b/speechx/speechx/decoder/ctc_tlg_decoder.h
index 361c44af5b75d704d91c0083399f543729840a7f..1ac46ac640140a3a38bc790530eb575688406683 100644
--- a/speechx/speechx/decoder/ctc_tlg_decoder.h
+++ b/speechx/speechx/decoder/ctc_tlg_decoder.h
@@ -38,6 +38,7 @@ class TLGDecoder {
std::string GetBestPath();
    std::vector<std::pair<double, std::string>> GetNBestPath();
std::string GetFinalBestPath();
+ std::string GetPartialResult();
int NumFrameDecoded();
    int DecodeLikelihoods(const std::vector<std::vector<kaldi::BaseFloat>>& probs,
                          std::vector<std::string>& nbest_words);
diff --git a/speechx/examples/ds2_ol/decoder/nnet-logprob-decoder-test.cc b/speechx/speechx/decoder/nnet_logprob_decoder_main.cc
similarity index 100%
rename from speechx/examples/ds2_ol/decoder/nnet-logprob-decoder-test.cc
rename to speechx/speechx/decoder/nnet_logprob_decoder_main.cc
diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h
index b2bf1890a41c5d831d5fc777bbec7aea93e1eb0e..495e5236c6f20a0f219d75d6e39a82067ea4b445 100644
--- a/speechx/speechx/decoder/param.h
+++ b/speechx/speechx/decoder/param.h
@@ -81,8 +81,8 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
frame_opts.preemph_coeff = 0.0;
opts.linear_spectrogram_opts.frame_opts = frame_opts;
}
- opts.feature_cache_opts.frame_chunk_size = FLAGS_receptive_field_length;
- opts.feature_cache_opts.frame_chunk_stride = FLAGS_downsampling_rate;
+ opts.assembler_opts.frame_chunk_size = FLAGS_receptive_field_length;
+ opts.assembler_opts.frame_chunk_stride = FLAGS_downsampling_rate;
return opts;
}
@@ -115,4 +115,4 @@ RecognizerResource InitRecognizerResoure() {
resource.tlg_opts = InitDecoderOptions();
return resource;
}
-}
\ No newline at end of file
+}
diff --git a/speechx/speechx/decoder/recognizer.cc b/speechx/speechx/decoder/recognizer.cc
index 2c90ada99e92ed61d922c640fddb2bbe3a7d0ec4..44c3911c92def5d7f9ba8b79336880fdae4bea81 100644
--- a/speechx/speechx/decoder/recognizer.cc
+++ b/speechx/speechx/decoder/recognizer.cc
@@ -44,6 +44,10 @@ std::string Recognizer::GetFinalResult() {
return decoder_->GetFinalBestPath();
}
+std::string Recognizer::GetPartialResult() {
+ return decoder_->GetPartialResult();
+}
+
void Recognizer::SetFinished() {
feature_pipeline_->SetFinished();
input_finished_ = true;
diff --git a/speechx/speechx/decoder/recognizer.h b/speechx/speechx/decoder/recognizer.h
index 9a7e7d11eb39989b2d50fc935f512cbf9a40361b..35e1e1676d1836bbfbbe9599a919a86ada09c613 100644
--- a/speechx/speechx/decoder/recognizer.h
+++ b/speechx/speechx/decoder/recognizer.h
@@ -43,6 +43,7 @@ class Recognizer {
    void Accept(const kaldi::Vector<kaldi::BaseFloat>& waves);
void Decode();
std::string GetFinalResult();
+ std::string GetPartialResult();
void SetFinished();
bool IsFinished();
void Reset();
diff --git a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc b/speechx/speechx/decoder/recognizer_main.cc
similarity index 100%
rename from speechx/examples/ds2_ol/decoder/recognizer_test_main.cc
rename to speechx/speechx/decoder/recognizer_main.cc
diff --git a/speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc b/speechx/speechx/decoder/tlg_decoder_main.cc
similarity index 100%
rename from speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc
rename to speechx/speechx/decoder/tlg_decoder_main.cc
diff --git a/speechx/speechx/frontend/audio/CMakeLists.txt b/speechx/speechx/frontend/audio/CMakeLists.txt
index 745832fe7c79cfd87f94a53b7c00e594a6ce3e0f..8ae63256a47991a0d7ed8c23cde11f417034ba68 100644
--- a/speechx/speechx/frontend/audio/CMakeLists.txt
+++ b/speechx/speechx/frontend/audio/CMakeLists.txt
@@ -8,6 +8,24 @@ add_library(frontend STATIC
feature_cache.cc
feature_pipeline.cc
fbank.cc
+ assembler.cc
)
-
target_link_libraries(frontend PUBLIC kaldi-matrix kaldi-feat-common kaldi-fbank)
+
+
+
+set(bin_name cmvn_json2kaldi_main)
+add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
+target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog)
+
+set(BINS
+ compute_linear_spectrogram_main
+ compute_fbank_main
+)
+
+foreach(bin_name IN LISTS BINS)
+ add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
+ target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+ target_link_libraries(${bin_name} PUBLIC frontend utils kaldi-util gflags glog)
+endforeach()
diff --git a/speechx/speechx/frontend/audio/assembler.cc b/speechx/speechx/frontend/audio/assembler.cc
new file mode 100644
index 0000000000000000000000000000000000000000..47e0705b9561650cbe23f7f1022cbbc4f14e7674
--- /dev/null
+++ b/speechx/speechx/frontend/audio/assembler.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "frontend/audio/assembler.h"
+
+namespace ppspeech {
+
+using kaldi::Vector;
+using kaldi::VectorBase;
+using kaldi::BaseFloat;
+using std::unique_ptr;
+
+Assembler::Assembler(AssemblerOptions opts,
+                     unique_ptr<FrontendInterface> base_extractor) {
+ frame_chunk_stride_ = opts.frame_chunk_stride;
+ frame_chunk_size_ = opts.frame_chunk_size;
+ base_extractor_ = std::move(base_extractor);
+ dim_ = base_extractor_->Dim();
+}
+
+void Assembler::Accept(const kaldi::VectorBase<BaseFloat>& inputs) {
+ // read inputs
+ base_extractor_->Accept(inputs);
+}
+
+// pop feature chunk
+bool Assembler::Read(kaldi::Vector<BaseFloat>* feats) {
+ feats->Resize(dim_ * frame_chunk_size_);
+ bool result = Compute(feats);
+ return result;
+}
+
+// read all data from base_feature_extractor_ into cache_
+bool Assembler::Compute(Vector<BaseFloat>* feats) {
+ // compute and feed
+ bool result = false;
+ while (feature_cache_.size() < frame_chunk_size_) {
+        Vector<BaseFloat> feature;
+ result = base_extractor_->Read(&feature);
+ if (result == false || feature.Dim() == 0) return false;
+ feature_cache_.push(feature);
+ }
+
+ int32 counter = 0;
+ int32 cache_size = frame_chunk_size_ - frame_chunk_stride_;
+ int32 elem_dim = base_extractor_->Dim();
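+    // Copy frame_chunk_size_ frames into feats; the trailing cache_size
+    // (= frame_chunk_size_ - frame_chunk_stride_) frames are pushed back onto
+    // the queue so consecutive chunks overlap by that many frames.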
+ while (counter < frame_chunk_size_) {
+        Vector<BaseFloat>& val = feature_cache_.front();
+ int32 start = counter * elem_dim;
+ feats->Range(start, elem_dim).CopyFromVec(val);
+ if (frame_chunk_size_ - counter <= cache_size ) {
+ feature_cache_.push(val);
+ }
+ feature_cache_.pop();
+ counter++;
+ }
+
+ return result;
+}
+
+} // namespace ppspeech
diff --git a/speechx/speechx/frontend/audio/assembler.h b/speechx/speechx/frontend/audio/assembler.h
new file mode 100644
index 0000000000000000000000000000000000000000..4397d3f6da995bafc5bb6e37bb9e6c7d9ae6814b
--- /dev/null
+++ b/speechx/speechx/frontend/audio/assembler.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "base/common.h"
+#include "frontend/audio/frontend_itf.h"
+
+namespace ppspeech {
+
+struct AssemblerOptions {
+ int32 frame_chunk_size;
+ int32 frame_chunk_stride;
+
+ AssemblerOptions()
+ : frame_chunk_size(1),
+ frame_chunk_stride(1) {}
+};
+
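+// Assembler buffers feature frames from the base extractor and emits them in
+// fixed-size chunks of frame_chunk_size frames, advancing by
+// frame_chunk_stride frames per Read() (e.g. sized to the model's receptive
+// field, see decoder/param.h).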
+class Assembler : public FrontendInterface {
+ public:
+ explicit Assembler(
+ AssemblerOptions opts,
+        std::unique_ptr<FrontendInterface> base_extractor = NULL);
+
+ // Feed feats or waves
+    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
+
+ // feats size = num_frames * feat_dim
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
+
+ // feat dim
+ virtual size_t Dim() const { return dim_; }
+
+ virtual void SetFinished() {
+ base_extractor_->SetFinished();
+ }
+
+ virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
+
+ virtual void Reset() {
+ base_extractor_->Reset();
+ }
+
+ private:
+    bool Compute(kaldi::Vector<kaldi::BaseFloat>* feats);
+
+ int32 dim_;
+ int32 frame_chunk_size_; // window
+ int32 frame_chunk_stride_; // stride
+    std::queue<kaldi::Vector<kaldi::BaseFloat>> feature_cache_;
+    std::unique_ptr<FrontendInterface> base_extractor_;
+ DISALLOW_COPY_AND_ASSIGN(Assembler);
+};
+
+} // namespace ppspeech
diff --git a/speechx/examples/ds2_ol/feat/cmvn-json2kaldi.cc b/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc
similarity index 100%
rename from speechx/examples/ds2_ol/feat/cmvn-json2kaldi.cc
rename to speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc
diff --git a/speechx/examples/ds2_ol/feat/compute_fbank_main.cc b/speechx/speechx/frontend/audio/compute_fbank_main.cc
similarity index 96%
rename from speechx/examples/ds2_ol/feat/compute_fbank_main.cc
rename to speechx/speechx/frontend/audio/compute_fbank_main.cc
index 67683eebf6e2fb3f73b6a44f6c9ac682c6c5cda7..18024719b6c2bf85c28050dabc41a4fcacf7d085 100644
--- a/speechx/examples/ds2_ol/feat/compute_fbank_main.cc
+++ b/speechx/speechx/frontend/audio/compute_fbank_main.cc
@@ -64,10 +64,6 @@ int main(int argc, char* argv[]) {
ppspeech::FeatureCacheOptions feat_cache_opts;
// the feature cache output feature chunk by chunk.
- // frame_chunk_size : num frame of a chunk.
- // frame_chunk_stride: chunk sliding window stride.
- feat_cache_opts.frame_chunk_stride = 1;
- feat_cache_opts.frame_chunk_size = 1;
ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn));
LOG(INFO) << "fbank: " << true;
LOG(INFO) << "feat dim: " << feature_cache.Dim();
diff --git a/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
similarity index 96%
rename from speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc
rename to speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
index bbf0e6908dddb0e7b1c776d9f42616fb70d92e81..cc7a5e17c8a16acbacb7581cb67f232d46260caa 100644
--- a/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc
+++ b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
@@ -12,8 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// todo refactor, repalce with gtest
-
#include "base/flags.h"
#include "base/log.h"
#include "kaldi/feat/wave-reader.h"
@@ -68,10 +66,6 @@ int main(int argc, char* argv[]) {
ppspeech::FeatureCacheOptions feat_cache_opts;
// the feature cache output feature chunk by chunk.
- // frame_chunk_size : num frame of a chunk.
- // frame_chunk_stride: chunk sliding window stride.
- feat_cache_opts.frame_chunk_stride = 1;
- feat_cache_opts.frame_chunk_size = 1;
ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn));
LOG(INFO) << "feat dim: " << feature_cache.Dim();
diff --git a/speechx/speechx/frontend/audio/feature_cache.cc b/speechx/speechx/frontend/audio/feature_cache.cc
index 05283bb7e51e863759db8b728ce54d5448c90a88..930f29c5497efec451ff8c6bfa608b2046c4ccb8 100644
--- a/speechx/speechx/frontend/audio/feature_cache.cc
+++ b/speechx/speechx/frontend/audio/feature_cache.cc
@@ -26,8 +26,6 @@ using std::unique_ptr;
FeatureCache::FeatureCache(FeatureCacheOptions opts,
                           unique_ptr<FrontendInterface> base_extractor) {
max_size_ = opts.max_size;
- frame_chunk_stride_ = opts.frame_chunk_stride;
- frame_chunk_size_ = opts.frame_chunk_size;
timeout_ = opts.timeout; // ms
base_extractor_ = std::move(base_extractor);
dim_ = base_extractor_->Dim();
@@ -74,24 +72,11 @@ bool FeatureCache::Compute() {
bool result = base_extractor_->Read(&feature);
if (result == false || feature.Dim() == 0) return false;
- // join with remained
- int32 joint_len = feature.Dim() + remained_feature_.Dim();
-    Vector<BaseFloat> joint_feature(joint_len);
- joint_feature.Range(0, remained_feature_.Dim())
- .CopyFromVec(remained_feature_);
- joint_feature.Range(remained_feature_.Dim(), feature.Dim())
- .CopyFromVec(feature);
-
- // one by one, or stride with window
- // controlled by frame_chunk_stride_ and frame_chunk_size_
- int32 num_chunk =
- ((joint_len / dim_) - frame_chunk_size_) / frame_chunk_stride_ + 1;
+ int32 num_chunk = feature.Dim() / dim_ ;
for (int chunk_idx = 0; chunk_idx < num_chunk; ++chunk_idx) {
- int32 start = chunk_idx * frame_chunk_stride_ * dim_;
-
-        Vector<BaseFloat> feature_chunk(frame_chunk_size_ * dim_);
-        SubVector<BaseFloat> tmp(joint_feature.Data() + start,
-                                 frame_chunk_size_ * dim_);
+        int32 start = chunk_idx * dim_;
+        Vector<BaseFloat> feature_chunk(dim_);
+        SubVector<BaseFloat> tmp(feature.Data() + start, dim_);
feature_chunk.CopyFromVec(tmp);
        std::unique_lock<std::mutex> lock(mutex_);
@@ -104,13 +89,6 @@ bool FeatureCache::Compute() {
cache_.push(feature_chunk);
ready_read_condition_.notify_one();
}
-
- // cache remained feats
- int32 remained_feature_len =
- joint_len - num_chunk * frame_chunk_stride_ * dim_;
- remained_feature_.Resize(remained_feature_len);
- remained_feature_.CopyFromVec(joint_feature.Range(
- frame_chunk_stride_ * num_chunk * dim_, remained_feature_len));
return result;
}
diff --git a/speechx/speechx/frontend/audio/feature_cache.h b/speechx/speechx/frontend/audio/feature_cache.h
index 0dc704bbff9c268652d311571d33218b902b01cc..4c016056a8379d88742334eee2e071528be39e15 100644
--- a/speechx/speechx/frontend/audio/feature_cache.h
+++ b/speechx/speechx/frontend/audio/feature_cache.h
@@ -21,13 +21,9 @@ namespace ppspeech {
struct FeatureCacheOptions {
int32 max_size;
- int32 frame_chunk_size;
- int32 frame_chunk_stride;
int32 timeout; // ms
FeatureCacheOptions()
: max_size(kint16max),
- frame_chunk_size(1),
- frame_chunk_stride(1),
timeout(1) {}
};
@@ -80,7 +76,7 @@ class FeatureCache : public FrontendInterface {
std::condition_variable ready_feed_condition_;
std::condition_variable ready_read_condition_;
- // DISALLOW_COPY_AND_ASSGIN(FeatureCache);
+ DISALLOW_COPY_AND_ASSIGN(FeatureCache);
};
} // namespace ppspeech
diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc
index 087de0f0d14cc7c760389b4fb69ad216e0b77db9..9cacff9f7610b1825b48671fa92b4a291c7f80d5 100644
--- a/speechx/speechx/frontend/audio/feature_pipeline.cc
+++ b/speechx/speechx/frontend/audio/feature_pipeline.cc
@@ -35,8 +35,11 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) {
    unique_ptr<FrontendInterface> cmvn(
new ppspeech::CMVN(opts.cmvn_file, std::move(base_feature)));
- base_extractor_.reset(
+    unique_ptr<FrontendInterface> cache(
new ppspeech::FeatureCache(opts.feature_cache_opts, std::move(cmvn)));
+
+ base_extractor_.reset(
+ new ppspeech::Assembler(opts.assembler_opts, std::move(cache)));
}
-} // ppspeech
\ No newline at end of file
+} // ppspeech
diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h
index 6b9b4795e5431b20116a1e7cbed59415e8f1e0c7..b848f548b3d252b28ea521f63b6daa07237a9bdf 100644
--- a/speechx/speechx/frontend/audio/feature_pipeline.h
+++ b/speechx/speechx/frontend/audio/feature_pipeline.h
@@ -23,6 +23,7 @@
#include "frontend/audio/frontend_itf.h"
#include "frontend/audio/linear_spectrogram.h"
#include "frontend/audio/normalizer.h"
+#include "frontend/audio/assembler.h"
namespace ppspeech {
@@ -33,13 +34,16 @@ struct FeaturePipelineOptions {
LinearSpectrogramOptions linear_spectrogram_opts;
FbankOptions fbank_opts;
FeatureCacheOptions feature_cache_opts;
+ AssemblerOptions assembler_opts;
+
FeaturePipelineOptions()
: cmvn_file(""),
to_float32(false), // true, only for linear feature
use_fbank(true),
linear_spectrogram_opts(),
fbank_opts(),
- feature_cache_opts() {}
+ feature_cache_opts(),
+ assembler_opts() {}
};
class FeaturePipeline : public FrontendInterface {
@@ -59,4 +63,4 @@ class FeaturePipeline : public FrontendInterface {
private:
    std::unique_ptr<FrontendInterface> base_extractor_;
};
-}
\ No newline at end of file
+}
diff --git a/speechx/speechx/protocol/CMakeLists.txt b/speechx/speechx/protocol/CMakeLists.txt
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..98b2f38b43a87e9b548c34d96a1f601e957e0045 100644
--- a/speechx/speechx/protocol/CMakeLists.txt
+++ b/speechx/speechx/protocol/CMakeLists.txt
@@ -0,0 +1,3 @@
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+add_subdirectory(websocket)
diff --git a/speechx/speechx/protocol/websocket/CMakeLists.txt b/speechx/speechx/protocol/websocket/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c3454c399eb36355b756bfa2744361e957655a72
--- /dev/null
+++ b/speechx/speechx/protocol/websocket/CMakeLists.txt
@@ -0,0 +1,15 @@
+project(websocket)
+
+add_library(websocket STATIC
+ websocket_server.cc
+ websocket_client.cc
+)
+target_link_libraries(websocket PUBLIC frontend decoder nnet)
+
+add_executable(websocket_server_main ${CMAKE_CURRENT_SOURCE_DIR}/websocket_server_main.cc)
+target_include_directories(websocket_server_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+target_link_libraries(websocket_server_main PUBLIC fst websocket ${DEPS})
+
+add_executable(websocket_client_main ${CMAKE_CURRENT_SOURCE_DIR}/websocket_client_main.cc)
+target_include_directories(websocket_client_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+target_link_libraries(websocket_client_main PUBLIC fst websocket ${DEPS})
diff --git a/speechx/speechx/websocket/websocket_client.cc b/speechx/speechx/protocol/websocket/websocket_client.cc
similarity index 96%
rename from speechx/speechx/websocket/websocket_client.cc
rename to speechx/speechx/protocol/websocket/websocket_client.cc
index 6bd930b858aa10d15ac24397e2e29f33eeb22ebb..60e06db638d2121ad2ead24f9e5deaaae73160de 100644
--- a/speechx/speechx/websocket/websocket_client.cc
+++ b/speechx/speechx/protocol/websocket/websocket_client.cc
@@ -67,6 +67,9 @@ void WebSocketClient::ReadLoopFunc() {
if (obj["type"] == "final_result") {
result_ = obj["result"].as_string().c_str();
}
+ if (obj["type"] == "partial_result") {
+ partial_result_ = obj["result"].as_string().c_str();
+ }
if (obj["type"] == "speech_end") {
done_ = true;
break;
diff --git a/speechx/speechx/websocket/websocket_client.h b/speechx/speechx/protocol/websocket/websocket_client.h
similarity index 91%
rename from speechx/speechx/websocket/websocket_client.h
rename to speechx/speechx/protocol/websocket/websocket_client.h
index ac0aed310bd1f017550e3663a8589c740f769294..8635501a8e6a9d029d104083e51675a2820b232d 100644
--- a/speechx/speechx/websocket/websocket_client.h
+++ b/speechx/speechx/protocol/websocket/websocket_client.h
@@ -40,12 +40,14 @@ class WebSocketClient {
void SendEndSignal();
void SendDataEnd();
bool Done() const { return done_; }
- std::string GetResult() { return result_; }
+ std::string GetResult() const { return result_; }
+ std::string GetPartialResult() const { return partial_result_;}
private:
void Connect();
std::string host_;
std::string result_;
+ std::string partial_result_;
int port_;
bool done_ = false;
asio::io_context ioc_;
diff --git a/speechx/examples/ds2_ol/websocket/websocket_client_main.cc b/speechx/speechx/protocol/websocket/websocket_client_main.cc
similarity index 99%
rename from speechx/examples/ds2_ol/websocket/websocket_client_main.cc
rename to speechx/speechx/protocol/websocket/websocket_client_main.cc
index df658b0a2218b72af99c23d7a986aa4867732c6d..7ad36e3a563c58e40918666beca7b185115eb1cb 100644
--- a/speechx/examples/ds2_ol/websocket/websocket_client_main.cc
+++ b/speechx/speechx/protocol/websocket/websocket_client_main.cc
@@ -59,7 +59,6 @@ int main(int argc, char* argv[]) {
client.SendBinaryData(wav_chunk.data(),
wav_chunk.size() * sizeof(int16));
-
sample_offset += cur_chunk_size;
LOG(INFO) << "Send " << cur_chunk_size << " samples";
std::this_thread::sleep_for(
diff --git a/speechx/speechx/websocket/websocket_server.cc b/speechx/speechx/protocol/websocket/websocket_server.cc
similarity index 98%
rename from speechx/speechx/websocket/websocket_server.cc
rename to speechx/speechx/protocol/websocket/websocket_server.cc
index 28c9eca4ee7776e8f1c4606dff60fa13b1a284bd..a1abd98e66b8f1bac00ff5276506241c5702d2e9 100644
--- a/speechx/speechx/websocket/websocket_server.cc
+++ b/speechx/speechx/protocol/websocket/websocket_server.cc
@@ -75,9 +75,10 @@ void ConnectionHandler::OnSpeechData(const beast::flat_buffer& buffer) {
CHECK(recognizer_ != nullptr);
recognizer_->Accept(pcm_data);
- // TODO: return lpartial result
+ std::string partial_result = recognizer_->GetPartialResult();
+
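+    // send the current partial hypothesis back to the client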
json::value rv = {
- {"status", "ok"}, {"type", "partial_result"}, {"result", "TODO"}};
+ {"status", "ok"}, {"type", "partial_result"}, {"result", partial_result}};
ws_.text(true);
ws_.write(asio::buffer(json::serialize(rv)));
}
diff --git a/speechx/speechx/websocket/websocket_server.h b/speechx/speechx/protocol/websocket/websocket_server.h
similarity index 98%
rename from speechx/speechx/websocket/websocket_server.h
rename to speechx/speechx/protocol/websocket/websocket_server.h
index 9ea88282ec5e60682daeabcccf0d4f2c09c114fb..009fc42ed827fed1258f241a3e48936bf71a7daf 100644
--- a/speechx/speechx/websocket/websocket_server.h
+++ b/speechx/speechx/protocol/websocket/websocket_server.h
@@ -44,7 +44,6 @@ class ConnectionHandler {
void OnFinish();
void OnSpeechData(const beast::flat_buffer& buffer);
void OnError(const std::string& message);
- void OnPartialResult(const std::string& result);
void OnFinalResult(const std::string& result);
void DecodeThreadFunc();
std::string SerializeResult(bool finish);
diff --git a/speechx/examples/ds2_ol/websocket/websocket_server_main.cc b/speechx/speechx/protocol/websocket/websocket_server_main.cc
similarity index 100%
rename from speechx/examples/ds2_ol/websocket/websocket_server_main.cc
rename to speechx/speechx/protocol/websocket/websocket_server_main.cc
diff --git a/speechx/speechx/websocket/CMakeLists.txt b/speechx/speechx/websocket/CMakeLists.txt
deleted file mode 100644
index 582a380312f461d0775f8bb0b03d8370c4fddfc2..0000000000000000000000000000000000000000
--- a/speechx/speechx/websocket/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-project(websocket)
-
-add_library(websocket STATIC
- websocket_server.cc
- websocket_client.cc
-)
-target_link_libraries(websocket PUBLIC frontend decoder nnet)
diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
index e1f1853f6f0672486641fc6d1a7f1c8cc0c15eba..e0ebd1412ac9e2734f78948208fe1ed3b429d976 100755
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@@ -25,7 +25,7 @@ paddlespeech asr --model deepspeech2offline_librispeech --lang en --input ./en.w
# long audio restriction
{
wget -c https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/test_long_audio_01.wav
-paddlespeech asr --input test_long_audio_01.wav
+paddlespeech asr --model deepspeech2online_wenetspeech --input test_long_audio_01.wav -y
if [ $? -ne 255 ]; then
echo -e "\e[1;31mTime restriction not passed\e[0m"
exit 1
@@ -54,7 +54,7 @@ paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input
# Speech Translation (only support linux)
paddlespeech st --input ./en.wav
-# Speaker Verification
+# Speaker Verification
wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
paddlespeech vector --task spk --input 85236145389.wav
@@ -65,7 +65,7 @@ echo -e "demo1 85236145389.wav \n demo2 85236145389.wav" > vec.job
paddlespeech vector --task spk --input vec.job
echo -e "demo3 85236145389.wav \n demo4 85236145389.wav" | paddlespeech vector --task spk
-rm 85236145389.wav
+rm 85236145389.wav
rm vec.job
# shell pipeline
diff --git a/third_party/README.md b/third_party/README.md
index c73df5427b08fdfd12bbb7e779afd41b058f7780..843d0d3b2e69a573195c669ac0bbee8f52558960 100644
--- a/third_party/README.md
+++ b/third_party/README.md
@@ -1,27 +1,27 @@
* [python_kaldi_features](https://github.com/ZitengWang/python_kaldi_features)
commit: fc1bd6240c2008412ab64dc25045cd872f5e126c
ref: https://zhuanlan.zhihu.com/p/55371926
-licence: MIT
+license: MIT
* [python-pinyin](https://github.com/mozillazg/python-pinyin.git)
commit: 55e524aa1b7b8eec3d15c5306043c6cdd5938b03
-licence: MIT
+license: MIT
* [zhon](https://github.com/tsroten/zhon)
commit: 09bf543696277f71de502506984661a60d24494c
-licence: MIT
+license: MIT
* [pymmseg-cpp](https://github.com/pluskid/pymmseg-cpp.git)
commit: b76465045717fbb4f118c4fbdd24ce93bab10a6d
-licence: MIT
+license: MIT
* [chinese_text_normalization](https://github.com/speechio/chinese_text_normalization.git)
commit: 9e92c7bf2d6b5a7974305406d8e240045beac51c
-licence: MIT
+license: MIT
* [phkit](https://github.com/KuangDD/phkit.git)
commit: b2100293c1e36da531d7f30bd52c9b955a649522
-licence: None
+license: None
* [nnAudio](https://github.com/KinWaiCheuk/nnAudio.git)
-licence: MIT
+license: MIT
diff --git a/third_party/ctc_decoders/LICENSE b/third_party/ctc_decoders/LICENSE
index eeef74b3099776f6afa02dd3fe0ca7f304138248..ad947f8d75624fa3862c37db49a6f249bc11dafb 100644
--- a/third_party/ctc_decoders/LICENSE
+++ b/third_party/ctc_decoders/LICENSE
@@ -5,4 +5,4 @@ score.h and score.cpp is under the LGPL license.
The two files include the header files from KenLM project.
For the rest:
-The default licence of paddlespeech-ctcdecoders is Apache License 2.0.
+The default license of paddlespeech-ctcdecoders is Apache License 2.0.
diff --git a/utils/README.md b/utils/README.md
index 163be850f7279268290075bce104a7e77b2acc59..db2064efa9da61f9b395af02f4531e6845216aaa 100644
--- a/utils/README.md
+++ b/utils/README.md
@@ -1,4 +1,4 @@
# Utils
* [kaldi utils](https://github.com/kaldi-asr/kaldi/blob/cbed4ff688/egs/wsj/s5/utils)
-* [espnet utils)(https://github.com/espnet/espnet/tree/master/utils)
+* [espnet utils](https://github.com/espnet/espnet/tree/master/utils)
diff --git a/utils/compute-wer.py b/utils/compute-wer.py
index 978a80c9f1d845bc49ffed7a4350a4705bdd50b6..98bb24a7ebd5dff7c7193288caade9e50c03e670 100755
--- a/utils/compute-wer.py
+++ b/utils/compute-wer.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# CopyRight WeNet Apache-2.0 License
+# Copyright 2021 Mobvoi Inc. All Rights Reserved.
import codecs
import re
import sys
diff --git a/speechx/examples/ngram/zh/local/text_to_lexicon.py b/utils/text_to_lexicon.py
similarity index 100%
rename from speechx/examples/ngram/zh/local/text_to_lexicon.py
rename to utils/text_to_lexicon.py