Unverified commit 880c172d authored by liangym, committed by GitHub

[TTS] add svs frontend (#3062)

Parent 46334ae0
......@@ -70,7 +70,7 @@ Train a FastSpeech2 model.
optional arguments:
-h, --help show this help message and exit
--config CONFIG fastspeech2 config file.
--config CONFIG diffsinger config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
......@@ -126,6 +126,7 @@ optional arguments:
-h, --help show this help message and exit
--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
Choose acoustic model type of tts task.
{diffsinger_opencpop} Choose acoustic model type of svs task.
--am_config AM_CONFIG
Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
......@@ -141,6 +142,7 @@ optional arguments:
whether training voice cloning model.
--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
Choose vocoder type of tts task.
{pwgan_opencpop, hifigan_opencpop} Choose vocoder type of svs task.
--voc_config VOC_CONFIG
Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
......@@ -151,9 +153,84 @@ optional arguments:
test metadata.
--output_dir OUTPUT_DIR
output dir.
--speech-stretchs mel min and max values file.
--speech-stretchs SPEECH_STRETCHS
The min and max values of the mel spectrum, using on diffusion of diffsinger.
```
`./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveforms from a text file.
`local/pinyin_to_phone.txt` comes from the README of the Opencpop dataset and defines the pinyin-to-phoneme mapping used in Opencpop.
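Each line of `local/pinyin_to_phone.txt` maps one pinyin syllable to its phonemes, separated by `|`, for example:
```text
a|a
ba|b a
zhong|zh ong
```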
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize_e2e.py [-h]
[--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
[--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--lang LANG]
[--inference_dir INFERENCE_DIR] [--ngpu NGPU]
[--text TEXT] [--output_dir OUTPUT_DIR]
[--pinyin_phone PINYIN_PHONE]
[--speech_stretchs SPEECH_STRETCHS]
Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
Choose acoustic model type of tts task.
{diffsinger_opencpop} Choose acoustic model type of svs task.
--am_config AM_CONFIG
Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
--phones_dict PHONES_DICT
phone vocabulary file.
--speaker_dict SPEAKER_DICT
speaker id map file.
--spk_id SPK_ID spk id for multi speaker acoustic model
--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
Choose vocoder type of tts task.
{pwgan_opencpop, hifigan_opencpop} Choose vocoder type of svs task.
--voc_config VOC_CONFIG
Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
--lang LANG {zh, en, mix, canton} Choose language type of tts task.
{sing} Choose language type of svs task.
--inference_dir INFERENCE_DIR
dir to save inference models
--ngpu NGPU if ngpu == 0, use cpu.
--text TEXT text to synthesize file, a 'utt_id sentence' pair per line for tts task.
A '{ utt_id input_type (is word) text notes note_durs}' or '{utt_id input_type (is phoneme) phones notes note_durs is_slurs}' pair per line for svs task.
--output_dir OUTPUT_DIR
output dir.
--pinyin_phone PINYIN_PHONE
pinyin to phone map file, using on sing_frontend.
--speech_stretchs SPEECH_STRETCHS
The min and max values of the mel spectrum, using on diffusion of diffsinger.
```
1. `--am` is acoustic model type with the format {model_name}_{dataset}
2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the diffsinger pretrained model.
3. `--voc` is vocoder type with the format {model_name}_{dataset}
4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
5. `--lang` is the language: `zh`, `en`, `mix` and `canton` for the tts task, `sing` for the svs task.
6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
7. `--text` is the text file, which contains sentences to synthesize.
8. `--output_dir` is the directory to save synthesized audio files.
9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
10. `--inference_dir` is the directory to save static models. If this argument is omitted, no static model will be exported.
11. `--pinyin_phone` is the pinyin-to-phone mapping file used by the singing frontend.
12. `--speech_stretchs` is the file with the min and max values of the mel spectrum, used by the diffusion module of diffsinger (see the sketch below).
Note: at present, the diffsinger model does not support dynamic-to-static conversion, so do not add `--inference_dir`.
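The sketch below (an assumption, not the exact code in this repo) shows how the min and max values stored in `speech_stretchs.npy` might be used to linearly stretch a mel spectrogram into a fixed range before diffusion:
```python
# A minimal sketch, assuming speech_stretchs.npy stores per-bin [min, max] rows
# and that the diffusion module works on mel values stretched into [-1, 1].
import numpy as np

stretchs = np.load("speech_stretchs.npy")      # assumed shape: (2, n_mels)
mel_min, mel_max = stretchs[0], stretchs[1]

def stretch(mel: np.ndarray) -> np.ndarray:
    """Linearly map each mel bin from [mel_min, mel_max] to [-1, 1]."""
    return (mel - mel_min) / (mel_max - mel_min) * 2.0 - 1.0

def unstretch(mel: np.ndarray) -> np.ndarray:
    """Inverse mapping, applied to the denoised mel before vocoding."""
    return (mel + 1.0) / 2.0 * (mel_max - mel_min) + mel_min
```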
## Pretrained Model
Pretrained DiffSinger model:
......@@ -165,10 +242,35 @@ diffsinger_opencpop_ckpt_1.4.0.zip
├── default.yaml # default config used to train diffsinger
├── energy_stats.npy # statistics used to normalize energy when training diffsinger if norm is needed
├── phone_id_map.txt # phone vocabulary file when training diffsinger
├── pinyin_to_phone.txt # pinyin-to-phoneme mapping file when training diffsinger
├── pitch_stats.npy # statistics used to normalize pitch when training diffsinger if norm is needed
├── snapshot_iter_160000.pdz # model parameters of diffsinger
├── speech_stats.npy # statistics used to normalize mel when training diffsinger if norm is needed
└── speech_stretchs.npy # Min and max values to use for mel spectral stretching before training diffusion
└── speech_stretchs.npy # min and max values to use for mel spectral stretching before training diffusion
```
You can use the following script to synthesize waveforms for `${BIN_DIR}/../sentences_sing.txt` using the pretrained diffsinger and parallel wavegan models.
```bash
source path.sh
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=diffsinger_opencpop \
--am_config=diffsinger_opencpop_ckpt_1.4.0/default.yaml \
--am_ckpt=diffsinger_opencpop_ckpt_1.4.0/snapshot_iter_160000.pdz \
--am_stat=diffsinger_opencpop_ckpt_1.4.0/speech_stats.npy \
--voc=pwgan_opencpop \
--voc_config=pwgan_opencpop_ckpt_1.4.0/default.yaml \
--voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
--voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
--lang=sing \
--text=${BIN_DIR}/../sentences_sing.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt \
--pinyin_phone=diffsinger_opencpop_ckpt_1.4.0/pinyin_to_phone.txt \
--speech_stretchs=diffsinger_opencpop_ckpt_1.4.0/speech_stretchs.npy
```
At present, the text frontend is not perfect, so synthesizing audio with `synthesize_e2e` is not yet supported; try `synthesize` first.
\ No newline at end of file
......@@ -73,7 +73,7 @@ Train a DiffSinger model.
optional arguments:
-h, --help show this help message and exit
--config CONFIG fastspeech2 config file.
--config CONFIG diffsinger config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
......@@ -131,6 +131,7 @@ optional arguments:
-h, --help show this help message and exit
--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
Choose acoustic model type of tts task.
{diffsinger_opencpop} Choose acoustic model type of svs task.
--am_config AM_CONFIG
Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
......@@ -146,6 +147,7 @@ optional arguments:
whether training voice cloning model.
--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
Choose vocoder type of tts task.
{pwgan_opencpop, hifigan_opencpop} Choose vocoder type of svs task.
--voc_config VOC_CONFIG
Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
......@@ -156,9 +158,85 @@ optional arguments:
test metadata.
--output_dir OUTPUT_DIR
output dir.
--speech-stretchs mel min and max values file.
--speech-stretchs SPEECH_STRETCHS
The min and max values of the mel spectrum, using on diffusion of diffsinger.
```
`./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py` to synthesize waveforms from a text file.
`local/pinyin_to_phone.txt` comes from the README of the Opencpop dataset and defines the pinyin-to-phoneme mapping used in Opencpop.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize_e2e.py [-h]
[--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
[--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--lang LANG]
[--inference_dir INFERENCE_DIR] [--ngpu NGPU]
[--text TEXT] [--output_dir OUTPUT_DIR]
[--pinyin_phone PINYIN_PHONE]
[--speech_stretchs SPEECH_STRETCHS]
Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
Choose acoustic model type of tts task.
{diffsinger_opencpop} Choose acoustic model type of svs task.
--am_config AM_CONFIG
Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
--phones_dict PHONES_DICT
phone vocabulary file.
--speaker_dict SPEAKER_DICT
speaker id map file.
--spk_id SPK_ID spk id for multi speaker acoustic model
--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
Choose vocoder type of tts task.
{pwgan_opencpop, hifigan_opencpop} Choose vocoder type of svs task.
--voc_config VOC_CONFIG
Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
--lang LANG {zh, en, mix, canton} Choose language type of tts task.
{sing} Choose language type of svs task.
--inference_dir INFERENCE_DIR
dir to save inference models
--ngpu NGPU if ngpu == 0, use cpu.
--text TEXT text to synthesize file, a 'utt_id sentence' pair per line for tts task.
A '{ utt_id input_type (is word) text notes note_durs}' or '{utt_id input_type (is phoneme) phones notes note_durs is_slurs}' pair per line for svs task.
--output_dir OUTPUT_DIR
output dir.
--pinyin_phone PINYIN_PHONE
pinyin to phone map file, using on sing_frontend.
--speech_stretchs SPEECH_STRETCHS
The min and max values of the mel spectrum, using on diffusion of diffsinger.
```
1. `--am` is the acoustic model type with the format {model_name}_{dataset}
2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for the acoustic model, which correspond to the 4 files in the diffsinger pretrained model.
3. `--voc` is the vocoder type with the format {model_name}_{dataset}
4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for the vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
5. `--lang` is the language: `zh`, `en`, `mix` and `canton` for the tts task, `sing` for the svs task.
6. `--test_metadata` should be the normalized metadata file under `test` in the `dump` folder.
7. `--text` is the text file containing the sentences to synthesize.
8. `--output_dir` is the directory to save the synthesized audio files.
9. `--ngpu` is the number of GPUs to use; if ngpu == 0, the CPU is used.
10. `--inference_dir` is the directory to save static models. If this argument is omitted, no static model will be exported.
11. `--pinyin_phone` is the pinyin-to-phoneme mapping file.
12. `--speech_stretchs` is the file with the min and max values of the mel spectrum, used for the linear stretching before diffusion in diffsinger.
Note: at present, the diffsinger model does not support dynamic-to-static conversion, so do not add `--inference_dir`.
## Pretrained Model
Pretrained DiffSinger model:
- [diffsinger_opencpop_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/t2s/svs/opencpop/diffsinger_opencpop_ckpt_1.4.0.zip)
......@@ -170,10 +248,33 @@ diffsinger_opencpop_ckpt_1.4.0.zip
├── default.yaml # default config used to train diffsinger
├── energy_stats.npy # statistics used to normalize energy when training diffsinger if norm is needed
├── phone_id_map.txt # phone vocabulary file when training diffsinger
├── pinyin_to_phone.txt # pinyin-to-phoneme mapping file when training diffsinger
├── pitch_stats.npy # statistics used to normalize pitch when training diffsinger if norm is needed
├── snapshot_iter_160000.pdz # model parameters and optimizer states of diffsinger
├── speech_stats.npy # statistics used to normalize mel when training diffsinger if norm is needed
└── speech_stretchs.npy # min and max values used for mel spectrum stretching before training diffusion
```
At present, the text frontend is not perfect, so synthesizing audio with `synthesize_e2e` is not yet supported; try `synthesize` first.
You can use the following script to synthesize the sentences in `${BIN_DIR}/../sentences_sing.txt` with the pretrained diffsinger and parallel wavegan models.
```bash
source path.sh
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=diffsinger_opencpop \
--am_config=diffsinger_opencpop_ckpt_1.4.0/default.yaml \
--am_ckpt=diffsinger_opencpop_ckpt_1.4.0/snapshot_iter_160000.pdz \
--am_stat=diffsinger_opencpop_ckpt_1.4.0/speech_stats.npy \
--voc=pwgan_opencpop \
--voc_config=pwgan_opencpop_ckpt_1.4.0/default.yaml \
--voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
--voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
--lang=sing \
--text=${BIN_DIR}/../sentences_sing.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt \
--pinyin_phone=diffsinger_opencpop_ckpt_1.4.0/pinyin_to_phone.txt \
--speech_stretchs=diffsinger_opencpop_ckpt_1.4.0/speech_stretchs.npy
```
a|a
ai|ai
an|an
ang|ang
ao|ao
ba|b a
bai|b ai
ban|b an
bang|b ang
bao|b ao
bei|b ei
ben|b en
beng|b eng
bi|b i
bian|b ian
biao|b iao
bie|b ie
bin|b in
bing|b ing
bo|b o
bu|b u
ca|c a
cai|c ai
can|c an
cang|c ang
cao|c ao
ce|c e
cei|c ei
cen|c en
ceng|c eng
cha|ch a
chai|ch ai
chan|ch an
chang|ch ang
chao|ch ao
che|ch e
chen|ch en
cheng|ch eng
chi|ch i
chong|ch ong
chou|ch ou
chu|ch u
chua|ch ua
chuai|ch uai
chuan|ch uan
chuang|ch uang
chui|ch ui
chun|ch un
chuo|ch uo
ci|c i
cong|c ong
cou|c ou
cu|c u
cuan|c uan
cui|c ui
cun|c un
cuo|c uo
da|d a
dai|d ai
dan|d an
dang|d ang
dao|d ao
de|d e
dei|d ei
den|d en
deng|d eng
di|d i
dia|d ia
dian|d ian
diao|d iao
die|d ie
ding|d ing
diu|d iu
dong|d ong
dou|d ou
du|d u
duan|d uan
dui|d ui
dun|d un
duo|d uo
e|e
ei|ei
en|en
eng|eng
er|er
fa|f a
fan|f an
fang|f ang
fei|f ei
fen|f en
feng|f eng
fo|f o
fou|f ou
fu|f u
ga|g a
gai|g ai
gan|g an
gang|g ang
gao|g ao
ge|g e
gei|g ei
gen|g en
geng|g eng
gong|g ong
gou|g ou
gu|g u
gua|g ua
guai|g uai
guan|g uan
guang|g uang
gui|g ui
gun|g un
guo|g uo
ha|h a
hai|h ai
han|h an
hang|h ang
hao|h ao
he|h e
hei|h ei
hen|h en
heng|h eng
hm|h m
hng|h ng
hong|h ong
hou|h ou
hu|h u
hua|h ua
huai|h uai
huan|h uan
huang|h uang
hui|h ui
hun|h un
huo|h uo
ji|j i
jia|j ia
jian|j ian
jiang|j iang
jiao|j iao
jie|j ie
jin|j in
jing|j ing
jiong|j iong
jiu|j iu
ju|j v
juan|j van
jue|j ve
jun|j vn
ka|k a
kai|k ai
kan|k an
kang|k ang
kao|k ao
ke|k e
kei|k ei
ken|k en
keng|k eng
kong|k ong
kou|k ou
ku|k u
kua|k ua
kuai|k uai
kuan|k uan
kuang|k uang
kui|k ui
kun|k un
kuo|k uo
la|l a
lai|l ai
lan|l an
lang|l ang
lao|l ao
le|l e
lei|l ei
leng|l eng
li|l i
lia|l ia
lian|l ian
liang|l iang
liao|l iao
lie|l ie
lin|l in
ling|l ing
liu|l iu
lo|l o
long|l ong
lou|l ou
lu|l u
luan|l uan
lun|l un
luo|l uo
lv|l v
lve|l ve
m|m
ma|m a
mai|m ai
man|m an
mang|m ang
mao|m ao
me|m e
mei|m ei
men|m en
meng|m eng
mi|m i
mian|m ian
miao|m iao
mie|m ie
min|m in
ming|m ing
miu|m iu
mo|m o
mou|m ou
mu|m u
n|n
na|n a
nai|n ai
nan|n an
nang|n ang
nao|n ao
ne|n e
nei|n ei
nen|n en
neng|n eng
ng|n g
ni|n i
nian|n ian
niang|n iang
niao|n iao
nie|n ie
nin|n in
ning|n ing
niu|n iu
nong|n ong
nou|n ou
nu|n u
nuan|n uan
nun|n un
nuo|n uo
nv|n v
nve|n ve
o|o
ou|ou
pa|p a
pai|p ai
pan|p an
pang|p ang
pao|p ao
pei|p ei
pen|p en
peng|p eng
pi|p i
pian|p ian
piao|p iao
pie|p ie
pin|p in
ping|p ing
po|p o
pou|p ou
pu|p u
qi|q i
qia|q ia
qian|q ian
qiang|q iang
qiao|q iao
qie|q ie
qin|q in
qing|q ing
qiong|q iong
qiu|q iu
qu|q v
quan|q van
que|q ve
qun|q vn
ran|r an
rang|r ang
rao|r ao
re|r e
ren|r en
reng|r eng
ri|r i
rong|r ong
rou|r ou
ru|r u
rua|r ua
ruan|r uan
rui|r ui
run|r un
ruo|r uo
sa|s a
sai|s ai
san|s an
sang|s ang
sao|s ao
se|s e
sen|s en
seng|s eng
sha|sh a
shai|sh ai
shan|sh an
shang|sh ang
shao|sh ao
she|sh e
shei|sh ei
shen|sh en
sheng|sh eng
shi|sh i
shou|sh ou
shu|sh u
shua|sh ua
shuai|sh uai
shuan|sh uan
shuang|sh uang
shui|sh ui
shun|sh un
shuo|sh uo
si|s i
song|s ong
sou|s ou
su|s u
suan|s uan
sui|s ui
sun|s un
suo|s uo
ta|t a
tai|t ai
tan|t an
tang|t ang
tao|t ao
te|t e
tei|t ei
teng|t eng
ti|t i
tian|t ian
tiao|t iao
tie|t ie
ting|t ing
tong|t ong
tou|t ou
tu|t u
tuan|t uan
tui|t ui
tun|t un
tuo|t uo
wa|w a
wai|w ai
wan|w an
wang|w ang
wei|w ei
wen|w en
weng|w eng
wo|w o
wu|w u
xi|x i
xia|x ia
xian|x ian
xiang|x iang
xiao|x iao
xie|x ie
xin|x in
xing|x ing
xiong|x iong
xiu|x iu
xu|x v
xuan|x van
xue|x ve
xun|x vn
ya|y a
yan|y an
yang|y ang
yao|y ao
ye|y e
yi|y i
yin|y in
ying|y ing
yo|y o
yong|y ong
you|y ou
yu|y v
yuan|y van
yue|y ve
yun|y vn
za|z a
zai|z ai
zan|z an
zang|z ang
zao|z ao
ze|z e
zei|z ei
zen|z en
zeng|z eng
zha|zh a
zhai|zh ai
zhan|zh an
zhang|zh ang
zhao|zh ao
zhe|zh e
zhei|zh ei
zhen|zh en
zheng|zh eng
zhi|zh i
zhong|zh ong
zhou|zh ou
zhu|zh u
zhua|zh ua
zhuai|zh uai
zhuan|zh uan
zhuang|zh uang
zhui|zh ui
zhun|zh un
zhuo|zh uo
zi|z i
zong|z ong
zou|z ou
zu|z u
zuan|z uan
zui|z ui
zun|z un
zuo|z uo
\ No newline at end of file
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
stage=0
stop_stage=0
# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=diffsinger_opencpop \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_opencpop \
--voc_config=pwgan_opencpop_ckpt_1.4.0/default.yaml \
--voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
--voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
--lang=sing \
--text=${BIN_DIR}/../sentences_sing.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speech_stretchs=dump/train/speech_stretchs.npy \
--pinyin_phone=local/pinyin_to_phone.txt
fi
# for more GAN Vocoders
# hifigan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "in hifigan syn_e2e"
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=diffsinger_opencpop \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=hifigan_opencpop \
--voc_config=hifigan_opencpop_ckpt_1.4.0/default.yaml \
--voc_ckpt=hifigan_opencpop_ckpt_1.4.0/snapshot_iter_625000.pdz \
--voc_stat=hifigan_opencpop_ckpt_1.4.0/feats_stats.npy \
--lang=sing \
--text=${BIN_DIR}/../sentences_sing.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speech_stretchs=dump/train/speech_stretchs.npy \
--pinyin_phone=local/pinyin_to_phone.txt
fi
......@@ -30,3 +30,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
{"utt_id": "2093003457", "input_type": "word", "text": "小酒窝长睫毛AP是你最美的记号", "notes": "C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4", "note_durs": "0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340"}
{"utt_id": "2093003458", "input_type": "phoneme", "phones": "w o m ei t ian sh ui ui b u u zh ao AP x iang n ian n i d e w ei x iao iao AP" , "notes": "C#4/Db4 C#4/Db4 D#4/Eb4 D#4/Eb4 F4 F4 F#4/Gb4 F#4/Gb4 D#4/Eb4 D#4/Eb4 D#4/Eb4 A#3/Bb3 A#3/Bb3 A#3/Bb3 rest F#4/Gb4 F#4/Gb4 F4 F4 F#4/Gb4 F#4/Gb4 F4 F4 G#4/Ab4 G#4/Ab4 D#4/Eb4 D#4/Eb4 C#4/Db4 rest", "note_durs": "0.221750 0.221750 0.414460 0.414460 0.223160 0.223160 0.430900 0.430900 0.335990 0.269270 0.269270 0.289060 0.522690 0.522690 0.355060 0.397130 0.397130 0.247690 0.247690 0.406720 0.406720 0.246830 0.246830 0.307540 0.307540 0.429910 0.429910 0.519130 0.342300", "is_slurs": "0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0"}
\ No newline at end of file
......@@ -20,6 +20,7 @@ from typing import Dict
from typing import List
from typing import Optional
import jsonlines
import numpy as np
import onnxruntime as ort
import paddle
......@@ -35,6 +36,7 @@ from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip_static
from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.canton_frontend import CantonFrontend
from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
from paddlespeech.t2s.frontend.sing_frontend import SingFrontend
from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.modules.normalizer import ZScore
from paddlespeech.utils.dynamic_import import dynamic_import
......@@ -127,6 +129,19 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
return sentences
# input for svs
def get_sentences_svs(text_file: Optional[os.PathLike]):
# construct dataset for evaluation
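# each line of text_file is a JSON object (as in sentences_sing.txt) with keys
# such as utt_id, input_type, text/phones, notes, note_durs and is_slurs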
sentences = []
with jsonlines.open(text_file, 'r') as reader:
svs_inputs = list(reader)
for svs_input in svs_inputs:
utt_id = svs_input['utt_id']
sentence = svs_input
sentences.append((utt_id, sentence))
return sentences
# am only
def get_test_dataset(test_metadata: List[Dict[str, Any]],
am: str,
......@@ -268,6 +283,7 @@ def get_dev_dataloader(dev_metadata: List[Dict[str, Any]],
def get_frontend(lang: str='zh',
phones_dict: Optional[os.PathLike]=None,
tones_dict: Optional[os.PathLike]=None,
pinyin_phone: Optional[os.PathLike]=None,
use_rhy=False):
if lang == 'zh':
frontend = Frontend(
......@@ -281,18 +297,23 @@ def get_frontend(lang: str='zh',
elif lang == 'mix':
frontend = MixFrontend(
phone_vocab_path=phones_dict, tone_vocab_path=tones_dict)
elif lang == 'sing':
frontend = SingFrontend(
pinyin_phone_path=pinyin_phone, phone_vocab_path=phones_dict)
else:
print("wrong lang!")
return frontend
def run_frontend(frontend: object,
text: str,
merge_sentences: bool=False,
get_tone_ids: bool=False,
lang: str='zh',
to_tensor: bool=True,
add_blank: bool=False):
def run_frontend(
frontend: object,
text: str,
merge_sentences: bool=False,
get_tone_ids: bool=False,
lang: str='zh',
to_tensor: bool=True,
add_blank: bool=False,
svs_input: Dict[str, str]=None, ):
outs = dict()
if lang == 'zh':
input_ids = {}
......@@ -326,8 +347,18 @@ def run_frontend(frontend: object,
input_ids = frontend.get_input_ids(
text, merge_sentences=merge_sentences, to_tensor=to_tensor)
phone_ids = input_ids["phone_ids"]
elif lang == 'sing':
input_ids = frontend.get_input_ids(
svs_input=svs_input, to_tensor=to_tensor)
phone_ids = input_ids["phone_ids"]
note_ids = input_ids["note_ids"]
note_durs = input_ids["note_durs"]
is_slurs = input_ids["is_slurs"]
outs.update({'note_ids': note_ids})
outs.update({'note_durs': note_durs})
outs.update({'is_slurs': is_slurs})
else:
print("lang should in {'zh', 'en', 'mix', 'canton'}!")
print("lang should in {'zh', 'en', 'mix', 'canton', 'sing'}!")
outs.update({'phone_ids': phone_ids})
return outs
......@@ -474,6 +505,7 @@ def am_to_static(am_inference,
elif am_name == 'tacotron2':
am_inference = jit.to_static(
am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)])
elif am_name == 'vits':
if am_dataset in {"aishell3", "vctk"} and speaker_dict is not None:
am_inference = jit.to_static(
......@@ -485,8 +517,20 @@ def am_to_static(am_inference,
else:
am_inference = jit.to_static(
am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)])
elif am_name == 'diffsinger':
am_inference = jit.to_static(
am_inference,
input_spec=[
InputSpec([-1], dtype=paddle.int64), # phone
InputSpec([-1], dtype=paddle.int64), # note
InputSpec([-1], dtype=paddle.float32), # note_dur
InputSpec([-1], dtype=paddle.int64), # is_slur
])
jit.save(am_inference, os.path.join(inference_dir, am))
am_inference = jit.load(os.path.join(inference_dir, am))
return am_inference
......
......@@ -24,6 +24,7 @@ from paddlespeech.t2s.exps.syn_utils import am_to_static
from paddlespeech.t2s.exps.syn_utils import get_am_inference
from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import get_sentences
from paddlespeech.t2s.exps.syn_utils import get_sentences_svs
from paddlespeech.t2s.exps.syn_utils import get_voc_inference
from paddlespeech.t2s.exps.syn_utils import run_frontend
from paddlespeech.t2s.exps.syn_utils import voc_to_static
......@@ -44,20 +45,18 @@ def evaluate(args):
print(am_config)
print(voc_config)
sentences = get_sentences(text_file=args.text, lang=args.lang)
# frontend
frontend = get_frontend(
lang=args.lang,
phones_dict=args.phones_dict,
tones_dict=args.tones_dict,
pinyin_phone=args.pinyin_phone,
use_rhy=args.use_rhy)
print("frontend done!")
# acoustic model
am_name = args.am[:args.am.rindex('_')]
am_dataset = args.am[args.am.rindex('_') + 1:]
am_inference = get_am_inference(
am=args.am,
am_config=am_config,
......@@ -65,8 +64,10 @@ def evaluate(args):
am_stat=args.am_stat,
phones_dict=args.phones_dict,
tones_dict=args.tones_dict,
speaker_dict=args.speaker_dict)
speaker_dict=args.speaker_dict,
speech_stretchs=args.speech_stretchs, )
print("acoustic model done!")
# vocoder
voc_inference = get_voc_inference(
voc=args.voc,
......@@ -103,14 +104,25 @@ def evaluate(args):
N = 0
T = 0
if am_name == 'diffsinger':
sentences = get_sentences_svs(text_file=args.text)
else:
sentences = get_sentences(text_file=args.text, lang=args.lang)
for utt_id, sentence in sentences:
with timer() as t:
if am_name == "diffsinger":
text = ""
svs_input = sentence
else:
text = sentence
svs_input = None
frontend_dict = run_frontend(
frontend=frontend,
text=sentence,
text=text,
merge_sentences=merge_sentences,
get_tone_ids=get_tone_ids,
lang=args.lang)
lang=args.lang,
svs_input=svs_input)
phone_ids = frontend_dict['phone_ids']
with paddle.no_grad():
flags = 0
......@@ -134,6 +146,15 @@ def evaluate(args):
mel = am_inference(part_phone_ids, part_tone_ids)
elif am_name == 'tacotron2':
mel = am_inference(part_phone_ids)
elif am_name == 'diffsinger':
part_note_ids = frontend_dict['note_ids'][i]
part_note_durs = frontend_dict['note_durs'][i]
part_is_slurs = frontend_dict['is_slurs'][i]
mel = am_inference(
text=part_phone_ids,
note=part_note_ids,
note_dur=part_note_durs,
is_slur=part_is_slurs, )
# vocoder
wav = voc_inference(mel)
if flags == 0:
......@@ -178,6 +199,7 @@ def parse_args():
'fastspeech2_male-zh',
'fastspeech2_male-en',
'fastspeech2_male-mix',
'diffsinger_opencpop',
],
help='Choose acoustic model type of tts task.')
parser.add_argument(
......@@ -223,6 +245,8 @@ def parse_args():
'wavernn_csmsc',
'pwgan_male',
'hifigan_male',
'pwgan_opencpop',
'hifigan_opencpop',
],
help='Choose vocoder type of tts task.')
parser.add_argument(
......@@ -240,6 +264,7 @@ def parse_args():
'--lang',
type=str,
default='zh',
choices=['zh', 'en', 'mix', 'canton', 'sing'],
help='Choose model language. zh or en or mix')
parser.add_argument(
......@@ -259,6 +284,17 @@ def parse_args():
type=str2bool,
default=False,
help="run rhythm frontend or not")
parser.add_argument(
"--pinyin_phone",
type=str,
default=None,
help="pinyin to phone map file, using on sing_frontend.")
parser.add_argument(
"--speech_stretchs",
type=str,
default=None,
help="The min and max values of the mel spectrum, using on diffusion of diffsinger."
)
args = parser.parse_args()
return args
......
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from typing import Dict
from typing import List
import librosa
import numpy as np
import paddle
from pypinyin import lazy_pinyin
class SingFrontend():
def __init__(self, pinyin_phone_path: str, phone_vocab_path: str):
"""SVS Frontend
Args:
pinyin_phone_path (str): pinyin to phone file path, a 'pinyin|phones' (like: ba|b a ) pair per line.
phone_vocab_path (str): phone to phone id file path, a 'phone phone id' (like: a 4 ) pair per line.
"""
self.punc = '[:,;。?!“”‘’\':,;.?!]'
self.pinyin_phones = {'AP': 'AP', 'SP': 'SP'}
if pinyin_phone_path:
with open(pinyin_phone_path, 'rt', encoding='utf-8') as f:
for line in f.readlines():
pinyin_phn = [
x.strip() for x in line.split('|') if x.strip() != ''
]
self.pinyin_phones[pinyin_phn[0]] = pinyin_phn[1]
self.vocab_phones = {}
if phone_vocab_path:
with open(phone_vocab_path, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
for phn, id in phn_id:
self.vocab_phones[phn] = int(id)
def get_phones(self, sentence: str) -> List[int]:
"""get phone list
Args:
sentence (str): sentence
Returns:
List[int]: phones list
Example:
sentence = "你好"
phones = ['n i', 'h ao']
"""
# remove all punc
sentence = re.sub(self.punc, "", sentence)
# Pypinyin can't solve polyphonic words
sentence = sentence.replace('最长', '最常').replace('长睫毛', '常睫毛') \
.replace('那么长', '那么常').replace('多长', '多常') \
.replace('很长', '很常')
# lyric
pinyins = lazy_pinyin(sentence, strict=False)
# replace unk word with SP
pinyins = [
pinyin if pinyin in self.pinyin_phones.keys() else "SP"
for pinyin in pinyins
]
phones = [
self.pinyin_phones[pinyin.strip()] for pinyin in pinyins
if pinyin.strip() in self.pinyin_phones
]
return phones
def get_note_info(self, note_info: str) -> List[str]:
note_info = [x.strip() for x in note_info.split('|') if x.strip() != '']
return note_info
def process(
self,
phones: List[int],
notes: List[str],
note_durs: List[float], ) -> Dict[str, List[paddle.Tensor]]:
new_phones = []
new_notes = []
new_note_durs = []
is_slurs = []
assert len(phones) == len(notes) == len(
note_durs
), "Please check the input, text, notes, note_durs should be the same length."
for i in range(len(phones)):
phone = phones[i].split()
note = notes[i].split()
note_dur = note_durs[i].split()
for phn in phone:
new_phones.append(phn)
new_notes.append(note[0])
new_note_durs.append(note_dur[0])
is_slurs.append(0)
if len(note) > 1:
for i in range(1, len(note)):
new_phones.append(phone[-1])
new_notes.append(note[i])
new_note_durs.append(note_dur[i])
is_slurs.append(1)
return new_phones, new_notes, new_note_durs, is_slurs
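# Worked example (based on the first line of sentences_sing.txt): the word "长"
# becomes pinyin "chang" -> phones "ch ang"; its note info is
# "A#4/Bb4 F#4/Gb4" with note_durs "0.509550 0.183420". Every phone of the
# syllable gets the first note/dur, and each extra note repeats the last
# phone with is_slur = 1:
#   phones    -> ['ch', 'ang', 'ang']
#   notes     -> ['A#4/Bb4', 'A#4/Bb4', 'F#4/Gb4']
#   note_durs -> ['0.509550', '0.509550', '0.183420']
#   is_slurs  -> [0, 0, 1]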
def get_input_ids(self, svs_input: Dict[str, str],
to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
"""convert input to int/float.
Args:
svs_input (Dict[str, str]): if input_type is phoneme, the keys phones, notes, note_durs and is_slurs are needed;
if input_type is word, the keys text, notes, and note_durs are needed.
to_tensor (bool, optional): whether to convert to Tensor. Defaults to True.
Returns:
Dict[str, List[paddle.Tensor]]: result include phone_ids, note_ids, note_durs, is_slurs.
"""
result = {}
input_type = svs_input['input_type']
if input_type == 'phoneme':
assert "phones" in svs_input.keys() and "notes" in svs_input.keys() and "note_durs" in svs_input.keys() and "is_slurs" in svs_input.keys(), \
"When input_type is phoneme, phones, notes, note_durs, is_slurs should be in the svs_input."
phones = svs_input["phones"].split()
notes = svs_input["notes"].split()
note_durs = svs_input["note_durs"].split()
is_slurs = svs_input["is_slurs"].split()
assert len(phones) == len(notes) == len(note_durs) == len(
is_slurs
), "Please check the input, phones, notes, note_durs is_slurs should be the same length."
elif input_type == "word":
assert "text" in svs_input.keys() and "notes" in svs_input.keys() and "note_durs" in svs_input.keys(), \
"When input_type is word, text, notes, note_durs, should be in the svs_input."
phones = self.get_phones(svs_input['text'])
notes = self.get_note_info(svs_input['notes'])
note_durs = self.get_note_info(svs_input['note_durs'])
phones, notes, note_durs, is_slurs = self.process(
phones=phones, notes=notes, note_durs=note_durs)
phone_ids = [self.vocab_phones[phn] for phn in phones]
phone_ids = np.array(phone_ids, np.int64)
note_ids = [
librosa.note_to_midi(note.split("/")[0]) if note != 'rest' else 0
for note in notes
]
note_ids = np.array(note_ids, np.int64)
note_durs = np.array(note_durs, np.float32)
is_slurs = np.array(is_slurs, np.int64)
if to_tensor:
phone_ids = paddle.to_tensor(phone_ids)
note_ids = paddle.to_tensor(note_ids)
note_durs = paddle.to_tensor(note_durs)
is_slurs = paddle.to_tensor(is_slurs)
result['phone_ids'] = [phone_ids]
result['note_ids'] = [note_ids]
result['note_durs'] = [note_durs]
result['is_slurs'] = [is_slurs]
return result
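A minimal usage sketch of the `SingFrontend` above; the mapping/vocab paths and the input dict are illustrative (they mirror `local/pinyin_to_phone.txt`, `dump/phone_id_map.txt` and `sentences_sing.txt` used elsewhere in this change):
```python
# Minimal usage sketch of SingFrontend; the paths below must point to real files.
from paddlespeech.t2s.frontend.sing_frontend import SingFrontend

frontend = SingFrontend(
    pinyin_phone_path="local/pinyin_to_phone.txt",
    phone_vocab_path="dump/phone_id_map.txt")

# word-type input: one note group per character, groups separated by '|';
# a group with several notes marks a slur on that syllable.
svs_input = {
    "utt_id": "demo_001",
    "input_type": "word",
    "text": "你好",
    "notes": "C#4/Db4 | D#4/Eb4",
    "note_durs": "0.5 | 0.5",
}
outs = frontend.get_input_ids(svs_input=svs_input, to_tensor=True)
# outs contains 'phone_ids', 'note_ids', 'note_durs' and 'is_slurs', each a list
# holding one paddle.Tensor, ready to be fed to the diffsinger acoustic model.
```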