Unverified commit 68bd85a6, authored by W WongLaw, committed by GitHub

Merge branch 'PaddlePaddle:develop' into develop

......@@ -61,7 +61,7 @@ tts_python:
phones_dict:
tones_dict:
speaker_dict:
spk_id: 0
# voc (vocoder) choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3',
# 'pwgan_vctk', 'mb_melgan_csmsc', 'style_melgan_csmsc',
......@@ -87,7 +87,7 @@ tts_inference:
phones_dict:
tones_dict:
speaker_dict:
spk_id: 0
am_predictor_conf:
device: # set 'gpu:id' or 'cpu'
......
......@@ -29,7 +29,7 @@ tts_online:
phones_dict:
tones_dict:
speaker_dict:
spk_id: 0
# voc (vocoder) choices=['mb_melgan_csmsc, hifigan_csmsc']
# Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference
......@@ -70,7 +70,6 @@ tts_online-onnx:
phones_dict:
tones_dict:
speaker_dict:
spk_id: 0
am_sample_rate: 24000
am_sess_conf:
device: "cpu" # set 'gpu:id' or 'cpu'
......
......@@ -29,7 +29,7 @@ tts_online:
phones_dict:
tones_dict:
speaker_dict:
spk_id: 0
# voc (vocoder) choices=['mb_melgan_csmsc, hifigan_csmsc']
# Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference
......@@ -70,7 +70,6 @@ tts_online-onnx:
phones_dict:
tones_dict:
speaker_dict:
spk_id: 0
am_sample_rate: 24000
am_sess_conf:
device: "cpu" # set 'gpu:id' or 'cpu'
......
......@@ -20,4 +20,7 @@ Subpackages
paddlespeech.audio.io
paddlespeech.audio.metric
paddlespeech.audio.sox_effects
paddlespeech.audio.streamdata
paddlespeech.audio.text
paddlespeech.audio.transform
paddlespeech.audio.utils
paddlespeech.audio.streamdata.autodecode module
===============================================
.. automodule:: paddlespeech.audio.streamdata.autodecode
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.streamdata.cache module
==========================================
.. automodule:: paddlespeech.audio.streamdata.cache
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.streamdata.compat module
===========================================
.. automodule:: paddlespeech.audio.streamdata.compat
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.streamdata.extradatasets module
==================================================
.. automodule:: paddlespeech.audio.streamdata.extradatasets
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.streamdata.filters module
============================================
.. automodule:: paddlespeech.audio.streamdata.filters
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.streamdata.gopen module
==========================================
.. automodule:: paddlespeech.audio.streamdata.gopen
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.streamdata.handlers module
=============================================
.. automodule:: paddlespeech.audio.streamdata.handlers
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.streamdata.mix module
========================================
.. automodule:: paddlespeech.audio.streamdata.mix
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.streamdata.paddle\_utils module
==================================================
.. automodule:: paddlespeech.audio.streamdata.paddle_utils
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.streamdata.pipeline module
=============================================
.. automodule:: paddlespeech.audio.streamdata.pipeline
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.streamdata package
=====================================
.. automodule:: paddlespeech.audio.streamdata
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.audio.streamdata.autodecode
paddlespeech.audio.streamdata.cache
paddlespeech.audio.streamdata.compat
paddlespeech.audio.streamdata.extradatasets
paddlespeech.audio.streamdata.filters
paddlespeech.audio.streamdata.gopen
paddlespeech.audio.streamdata.handlers
paddlespeech.audio.streamdata.mix
paddlespeech.audio.streamdata.paddle_utils
paddlespeech.audio.streamdata.pipeline
paddlespeech.audio.streamdata.shardlists
paddlespeech.audio.streamdata.tariterators
paddlespeech.audio.streamdata.utils
paddlespeech.audio.streamdata.writer
paddlespeech.audio.streamdata.shardlists module
===============================================
.. automodule:: paddlespeech.audio.streamdata.shardlists
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.streamdata.tariterators module
=================================================
.. automodule:: paddlespeech.audio.streamdata.tariterators
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.streamdata.utils module
==========================================
.. automodule:: paddlespeech.audio.streamdata.utils
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.streamdata.writer module
===========================================
.. automodule:: paddlespeech.audio.streamdata.writer
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.text package
===============================
.. automodule:: paddlespeech.audio.text
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.audio.text.text_featurizer
paddlespeech.audio.text.utility
paddlespeech.audio.text.text\_featurizer module
===============================================
.. automodule:: paddlespeech.audio.text.text_featurizer
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.text.utility module
======================================
.. automodule:: paddlespeech.audio.text.utility
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.transform.add\_deltas module
===============================================
.. automodule:: paddlespeech.audio.transform.add_deltas
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.transform.channel\_selector module
=====================================================
.. automodule:: paddlespeech.audio.transform.channel_selector
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.transform.cmvn module
========================================
.. automodule:: paddlespeech.audio.transform.cmvn
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.transform.functional module
==============================================
.. automodule:: paddlespeech.audio.transform.functional
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.transform.perturb module
===========================================
.. automodule:: paddlespeech.audio.transform.perturb
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.transform package
====================================
.. automodule:: paddlespeech.audio.transform
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.audio.transform.add_deltas
paddlespeech.audio.transform.channel_selector
paddlespeech.audio.transform.cmvn
paddlespeech.audio.transform.functional
paddlespeech.audio.transform.perturb
paddlespeech.audio.transform.spec_augment
paddlespeech.audio.transform.spectrogram
paddlespeech.audio.transform.transform_interface
paddlespeech.audio.transform.transformation
paddlespeech.audio.transform.wpe
paddlespeech.audio.transform.spec\_augment module
=================================================
.. automodule:: paddlespeech.audio.transform.spec_augment
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.transform.spectrogram module
===============================================
.. automodule:: paddlespeech.audio.transform.spectrogram
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.transform.transform\_interface module
========================================================
.. automodule:: paddlespeech.audio.transform.transform_interface
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.transform.transformation module
==================================================
.. automodule:: paddlespeech.audio.transform.transformation
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.transform.wpe module
=======================================
.. automodule:: paddlespeech.audio.transform.wpe
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.utils.check\_kwargs module
=============================================
.. automodule:: paddlespeech.audio.utils.check_kwargs
:members:
:undoc-members:
:show-inheritance:
paddlespeech.audio.utils.dynamic\_import module
===============================================
.. automodule:: paddlespeech.audio.utils.dynamic_import
:members:
:undoc-members:
:show-inheritance:
......@@ -12,8 +12,11 @@ Submodules
.. toctree::
:maxdepth: 4
paddlespeech.audio.utils.check_kwargs
paddlespeech.audio.utils.download
paddlespeech.audio.utils.dynamic_import
paddlespeech.audio.utils.error
paddlespeech.audio.utils.log
paddlespeech.audio.utils.numeric
paddlespeech.audio.utils.tensor_utils
paddlespeech.audio.utils.time
paddlespeech.audio.utils.tensor\_utils module
=============================================
.. automodule:: paddlespeech.audio.utils.tensor_utils
:members:
:undoc-members:
:show-inheritance:
paddlespeech.kws.exps.mdtc.collate module
=========================================
.. automodule:: paddlespeech.kws.exps.mdtc.collate
:members:
:undoc-members:
:show-inheritance:
paddlespeech.kws.exps.mdtc.compute\_det module
==============================================
.. automodule:: paddlespeech.kws.exps.mdtc.compute_det
:members:
:undoc-members:
:show-inheritance:
paddlespeech.kws.exps.mdtc.plot\_det\_curve module
==================================================
.. automodule:: paddlespeech.kws.exps.mdtc.plot_det_curve
:members:
:undoc-members:
:show-inheritance:
paddlespeech.kws.exps.mdtc package
==================================
.. automodule:: paddlespeech.kws.exps.mdtc
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.kws.exps.mdtc.collate
paddlespeech.kws.exps.mdtc.compute_det
paddlespeech.kws.exps.mdtc.plot_det_curve
paddlespeech.kws.exps.mdtc.score
paddlespeech.kws.exps.mdtc.train
paddlespeech.kws.exps.mdtc.score module
=======================================
.. automodule:: paddlespeech.kws.exps.mdtc.score
:members:
:undoc-members:
:show-inheritance:
paddlespeech.kws.exps.mdtc.train module
=======================================
.. automodule:: paddlespeech.kws.exps.mdtc.train
:members:
:undoc-members:
:show-inheritance:
paddlespeech.kws.exps package
=============================
.. automodule:: paddlespeech.kws.exps
:members:
:undoc-members:
:show-inheritance:
Subpackages
-----------
.. toctree::
:maxdepth: 4
paddlespeech.kws.exps.mdtc
......@@ -12,4 +12,5 @@ Subpackages
.. toctree::
:maxdepth: 4
paddlespeech.kws.exps
paddlespeech.kws.models
paddlespeech.resource.model\_alias module
=========================================
.. automodule:: paddlespeech.resource.model_alias
:members:
:undoc-members:
:show-inheritance:
paddlespeech.resource.pretrained\_models module
===============================================
.. automodule:: paddlespeech.resource.pretrained_models
:members:
:undoc-members:
:show-inheritance:
paddlespeech.resource.resource module
=====================================
.. automodule:: paddlespeech.resource.resource
:members:
:undoc-members:
:show-inheritance:
paddlespeech.resource package
=============================
.. automodule:: paddlespeech.resource
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.resource.model_alias
paddlespeech.resource.pretrained_models
paddlespeech.resource.resource
......@@ -16,8 +16,10 @@ Subpackages
paddlespeech.cli
paddlespeech.cls
paddlespeech.kws
paddlespeech.resource
paddlespeech.s2t
paddlespeech.server
paddlespeech.t2s
paddlespeech.text
paddlespeech.utils
paddlespeech.vector
......@@ -19,5 +19,4 @@ Subpackages
paddlespeech.s2t.models
paddlespeech.s2t.modules
paddlespeech.s2t.training
paddlespeech.s2t.transform
paddlespeech.s2t.utils
......@@ -18,7 +18,6 @@ Submodules
paddlespeech.server.utils.config
paddlespeech.server.utils.errors
paddlespeech.server.utils.exception
paddlespeech.server.utils.log
paddlespeech.server.utils.onnx_infer
paddlespeech.server.utils.paddle_predictor
paddlespeech.server.utils.util
......
......@@ -19,4 +19,5 @@ Submodules
paddlespeech.t2s.datasets.get_feats
paddlespeech.t2s.datasets.ljspeech
paddlespeech.t2s.datasets.preprocess_utils
paddlespeech.t2s.datasets.sampler
paddlespeech.t2s.datasets.vocoder_batch_fn
paddlespeech.t2s.datasets.sampler module
========================================
.. automodule:: paddlespeech.t2s.datasets.sampler
:members:
:undoc-members:
:show-inheritance:
paddlespeech.t2s.exps.ernie\_sat.align module
=============================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.align
:members:
:undoc-members:
:show-inheritance:
paddlespeech.t2s.exps.ernie\_sat.normalize module
=================================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.normalize
:members:
:undoc-members:
:show-inheritance:
paddlespeech.t2s.exps.ernie\_sat.preprocess module
==================================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.preprocess
:members:
:undoc-members:
:show-inheritance:
paddlespeech.t2s.exps.ernie\_sat package
========================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.t2s.exps.ernie_sat.align
paddlespeech.t2s.exps.ernie_sat.normalize
paddlespeech.t2s.exps.ernie_sat.preprocess
paddlespeech.t2s.exps.ernie_sat.synthesize
paddlespeech.t2s.exps.ernie_sat.synthesize_e2e
paddlespeech.t2s.exps.ernie_sat.train
paddlespeech.t2s.exps.ernie_sat.utils
paddlespeech.t2s.exps.ernie\_sat.synthesize module
==================================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize
:members:
:undoc-members:
:show-inheritance:
paddlespeech.t2s.exps.ernie\_sat.synthesize\_e2e module
=======================================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize_e2e
:members:
:undoc-members:
:show-inheritance:
paddlespeech.t2s.exps.ernie\_sat.train module
=============================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.train
:members:
:undoc-members:
:show-inheritance:
paddlespeech.t2s.exps.ernie\_sat.utils module
=============================================
.. automodule:: paddlespeech.t2s.exps.ernie_sat.utils
:members:
:undoc-members:
:show-inheritance:
......@@ -16,3 +16,4 @@ Submodules
paddlespeech.t2s.exps.fastspeech2.normalize
paddlespeech.t2s.exps.fastspeech2.preprocess
paddlespeech.t2s.exps.fastspeech2.train
paddlespeech.t2s.exps.fastspeech2.vc2_infer
paddlespeech.t2s.exps.fastspeech2.vc2\_infer module
===================================================
.. automodule:: paddlespeech.t2s.exps.fastspeech2.vc2_infer
:members:
:undoc-members:
:show-inheritance:
......@@ -12,11 +12,13 @@ Subpackages
.. toctree::
:maxdepth: 4
paddlespeech.t2s.exps.ernie_sat
paddlespeech.t2s.exps.fastspeech2
paddlespeech.t2s.exps.gan_vocoder
paddlespeech.t2s.exps.speedyspeech
paddlespeech.t2s.exps.tacotron2
paddlespeech.t2s.exps.transformer_tts
paddlespeech.t2s.exps.vits
paddlespeech.t2s.exps.waveflow
paddlespeech.t2s.exps.wavernn
......@@ -31,6 +33,7 @@ Submodules
paddlespeech.t2s.exps.ort_predict
paddlespeech.t2s.exps.ort_predict_e2e
paddlespeech.t2s.exps.ort_predict_streaming
paddlespeech.t2s.exps.stream_play_tts
paddlespeech.t2s.exps.syn_utils
paddlespeech.t2s.exps.synthesize
paddlespeech.t2s.exps.synthesize_e2e
......
paddlespeech.t2s.exps.stream\_play\_tts module
==============================================
.. automodule:: paddlespeech.t2s.exps.stream_play_tts
:members:
:undoc-members:
:show-inheritance:
paddlespeech.t2s.exps.vits.normalize module
===========================================
.. automodule:: paddlespeech.t2s.exps.vits.normalize
:members:
:undoc-members:
:show-inheritance:
paddlespeech.t2s.exps.vits.preprocess module
============================================
.. automodule:: paddlespeech.t2s.exps.vits.preprocess
:members:
:undoc-members:
:show-inheritance:
paddlespeech.t2s.exps.vits package
==================================
.. automodule:: paddlespeech.t2s.exps.vits
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.t2s.exps.vits.normalize
paddlespeech.t2s.exps.vits.preprocess
paddlespeech.t2s.exps.vits.synthesize
paddlespeech.t2s.exps.vits.synthesize_e2e
paddlespeech.t2s.exps.vits.train
paddlespeech.t2s.exps.vits.voice_cloning
paddlespeech.t2s.exps.vits.synthesize module
============================================
.. automodule:: paddlespeech.t2s.exps.vits.synthesize
:members:
:undoc-members:
:show-inheritance:
paddlespeech.t2s.exps.vits.synthesize\_e2e module
=================================================
.. automodule:: paddlespeech.t2s.exps.vits.synthesize_e2e
:members:
:undoc-members:
:show-inheritance:
paddlespeech.t2s.exps.vits.train module
=======================================
.. automodule:: paddlespeech.t2s.exps.vits.train
:members:
:undoc-members:
:show-inheritance:
paddlespeech.t2s.exps.vits.voice\_cloning module
================================================
.. automodule:: paddlespeech.t2s.exps.vits.voice_cloning
:members:
:undoc-members:
:show-inheritance:
paddlespeech.t2s.frontend.g2pw.dataset module
=============================================
.. automodule:: paddlespeech.t2s.frontend.g2pw.dataset
:members:
:undoc-members:
:show-inheritance:
paddlespeech.t2s.frontend.g2pw.onnx\_api module
===============================================
.. automodule:: paddlespeech.t2s.frontend.g2pw.onnx_api
:members:
:undoc-members:
:show-inheritance:
paddlespeech.t2s.frontend.g2pw package
======================================
.. automodule:: paddlespeech.t2s.frontend.g2pw
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.t2s.frontend.g2pw.dataset
paddlespeech.t2s.frontend.g2pw.onnx_api
paddlespeech.t2s.frontend.g2pw.utils
paddlespeech.t2s.frontend.g2pw.utils module
===========================================
.. automodule:: paddlespeech.t2s.frontend.g2pw.utils
:members:
:undoc-members:
:show-inheritance:
paddlespeech.t2s.frontend.mix\_frontend module
==============================================
.. automodule:: paddlespeech.t2s.frontend.mix_frontend
:members:
:undoc-members:
:show-inheritance:
......@@ -12,6 +12,7 @@ Subpackages
.. toctree::
:maxdepth: 4
paddlespeech.t2s.frontend.g2pw
paddlespeech.t2s.frontend.normalizer
paddlespeech.t2s.frontend.zh_normalization
......@@ -23,6 +24,7 @@ Submodules
paddlespeech.t2s.frontend.arpabet
paddlespeech.t2s.frontend.generate_lexicon
paddlespeech.t2s.frontend.mix_frontend
paddlespeech.t2s.frontend.phonectic
paddlespeech.t2s.frontend.punctuation
paddlespeech.t2s.frontend.tone_sandhi
......
paddlespeech.t2s.models.ernie\_sat.ernie\_sat module
====================================================
.. automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat
:members:
:undoc-members:
:show-inheritance:
paddlespeech.t2s.models.ernie\_sat.ernie\_sat\_updater module
=============================================================
.. automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat_updater
:members:
:undoc-members:
:show-inheritance:
......@@ -12,4 +12,5 @@ Submodules
.. toctree::
:maxdepth: 4
paddlespeech.t2s.models.ernie_sat.mlm
paddlespeech.t2s.models.ernie_sat.ernie_sat
paddlespeech.t2s.models.ernie_sat.ernie_sat_updater
paddlespeech.t2s.models.vits.monotonic\_align.core module
=========================================================
.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.core
:members:
:undoc-members:
:show-inheritance:
paddlespeech.t2s.models.vits.monotonic\_align package
=====================================================
.. automodule:: paddlespeech.t2s.models.vits.monotonic_align
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.t2s.models.vits.monotonic_align.core
paddlespeech.t2s.models.vits.monotonic_align.setup
paddlespeech.t2s.models.vits.monotonic\_align.setup module
==========================================================
.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.setup
:members:
:undoc-members:
:show-inheritance:
paddlespeech.utils.dynamic\_import module
=========================================
.. automodule:: paddlespeech.utils.dynamic_import
:members:
:undoc-members:
:show-inheritance:
paddlespeech.utils.env module
=============================
.. automodule:: paddlespeech.utils.env
:members:
:undoc-members:
:show-inheritance:
paddlespeech.utils package
==========================
.. automodule:: paddlespeech.utils
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
paddlespeech.utils.dynamic_import
paddlespeech.utils.env
......@@ -74,8 +74,10 @@ Contents
paddlespeech.cli <api/paddlespeech.cli>
paddlespeech.cls <api/paddlespeech.cls>
paddlespeech.kws <api/paddlespeech.kws>
paddlespeech.resource <api/paddlespeech.resource>
paddlespeech.s2t <api/paddlespeech.s2t>
paddlespeech.server <api/paddlespeech.server>
paddlespeech.t2s <api/paddlespeech.t2s>
paddlespeech.text <api/paddlespeech.text>
paddlespeech.utils <api/paddlespeech.utils>
paddlespeech.vector <api/paddlespeech.vector>
......@@ -5,6 +5,7 @@
- [Disambiguation of Chinese Polyphones in an End-to-End Framework with Semantic Features Extracted by Pre-trained BERT](https://www1.se.cuhk.edu.hk/~hccl/publications/pub/201909_INTERSPEECH_DongyangDAI.pdf)
- [Polyphone Disambiguation in Mandarin Chinese with Semi-Supervised Learning](https://www.isca-speech.org/archive/pdfs/interspeech_2021/shi21d_interspeech.pdf)
* github: https://github.com/PaperMechanica/SemiPPL
- [WikipediaHomographData](https://github.com/google-research-datasets/WikipediaHomographData)
### Text Normalization
#### English
- [applenob/text_normalization](https://github.com/applenob/text_normalization)
......
# ERNIE-SAT with AISHELL3 dataset
# ERNIE-SAT with VCTK dataset
ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as speech editing, personalized speech synthesis, voice cloning, and simultaneous interpretation; this project is provided for research use.
## Model Framework
In ERNIE-SAT, we propose two innovations:
- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping
- The joint mask learning of speech and text is used to realize the alignment of speech and text
<p align="center">
<img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" />
......
# ERNIE-SAT with AISHELL3 and VCTK dataset
# ERNIE-SAT with VCTK dataset
ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as speech editing, personalized speech synthesis, voice cloning, and simultaneous interpretation; this project is provided for research use.
## Model Framework
In ERNIE-SAT, we propose two innovations:
- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping
- The joint mask learning of speech and text is used to realize the alignment of speech and text
<p align="center">
<img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" />
......
......@@ -75,6 +75,15 @@ When "Prepare" is done, the structure of the current directory is listed below.
```
### Set finetune.yaml
`finetune.yaml` contains the configuration for fine-tuning. You can try various options to get a better result.
Arguments:
- `batch_size`: fine-tuning batch size. Default: -1, which means 64, the same as the pretrained model.
- `learning_rate`: learning rate. Default: 0.0001.
- `num_snapshots`: number of saved checkpoints. Default: -1, which means 5, the same as the pretrained model.
- `frozen_layers`: layers to freeze; must be a list. If you don't want to freeze any layer, set it to []. A sketch of how these values are applied is shown below.
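Below is a minimal sketch of how these values take effect, mirroring the merge logic in `finetune.py` later in this diff; the config file paths are illustrative assumptions, not fixed names.

```python
# Hedged sketch of the finetune.yaml override semantics (paths are examples).
import yaml
from yacs.config import CfgNode

with open("pretrained_models/fastspeech2_ckpt/default.yaml") as f:  # example path
    config = CfgNode(yaml.safe_load(f))
with open("./finetune.yaml") as f:
    finetune_config = CfgNode(yaml.safe_load(f))

# -1 keeps the value from the pretrained model's config; a positive value overrides it.
config.batch_size = finetune_config.batch_size if finetune_config.batch_size > 0 else config.batch_size
config.optimizer.learning_rate = (finetune_config.learning_rate
                                  if finetune_config.learning_rate > 0
                                  else config.optimizer.learning_rate)
config.num_snapshots = finetune_config.num_snapshots if finetune_config.num_snapshots > 0 else config.num_snapshots
assert isinstance(finetune_config.frozen_layers, list), "frozen_layers should be a list."
```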
## Get Started
Run the command below to
......
......@@ -14,6 +14,7 @@
import argparse
import os
from pathlib import Path
from typing import List
from typing import Union
import yaml
......@@ -21,10 +22,10 @@ from local.check_oov import get_check_result
from local.extract import extract_feature
from local.label_process import get_single_label
from local.prepare_env import generate_finetune_env
from local.train import train_sp
from paddle import distributed as dist
from yacs.config import CfgNode
from paddlespeech.t2s.exps.fastspeech2.train import train_sp
from utils.gen_duration_from_textgrid import gen_duration_from_textgrid
DICT_EN = 'tools/aligner/cmudict-0.7b'
......@@ -38,15 +39,24 @@ os.environ['PATH'] = MFA_PATH + '/:' + os.environ['PATH']
class TrainArgs():
def __init__(self, ngpu, config_file, dump_dir: Path, output_dir: Path):
def __init__(self,
ngpu,
config_file,
dump_dir: Path,
output_dir: Path,
frozen_layers: List[str]):
# config: fastspeech2 config file.
self.config = str(config_file)
self.train_metadata = str(dump_dir / "train/norm/metadata.jsonl")
self.dev_metadata = str(dump_dir / "dev/norm/metadata.jsonl")
# model output dir.
self.output_dir = str(output_dir)
self.ngpu = ngpu
self.phones_dict = str(dump_dir / "phone_id_map.txt")
self.speaker_dict = str(dump_dir / "speaker_id_map.txt")
self.voice_cloning = False
# frozen layers
self.frozen_layers = frozen_layers
def get_mfa_result(
......@@ -122,12 +132,11 @@ if __name__ == '__main__':
"--ngpu", type=int, default=2, help="if ngpu=0, use cpu.")
parser.add_argument("--epoch", type=int, default=100, help="finetune epoch")
parser.add_argument(
"--batch_size",
type=int,
default=-1,
help="batch size, default -1 means same as pretrained model")
"--finetune_config",
type=str,
default="./finetune.yaml",
help="Path to finetune config file")
args = parser.parse_args()
......@@ -147,8 +156,14 @@ if __name__ == '__main__':
with open(config_file) as f:
config = CfgNode(yaml.safe_load(f))
config.max_epoch = config.max_epoch + args.epoch
if args.batch_size > 0:
config.batch_size = args.batch_size
with open(args.finetune_config) as f2:
finetune_config = CfgNode(yaml.safe_load(f2))
config.batch_size = finetune_config.batch_size if finetune_config.batch_size > 0 else config.batch_size
config.optimizer.learning_rate = finetune_config.learning_rate if finetune_config.learning_rate > 0 else config.optimizer.learning_rate
config.num_snapshots = finetune_config.num_snapshots if finetune_config.num_snapshots > 0 else config.num_snapshots
frozen_layers = finetune_config.frozen_layers
assert type(frozen_layers) == list, "frozen_layers should be a list."
if args.lang == 'en':
lexicon_file = DICT_EN
......@@ -158,6 +173,13 @@ if __name__ == '__main__':
mfa_phone_file = MFA_PHONE_ZH
else:
print('please input right lang!!')
print(f"finetune max_epoch: {config.max_epoch}")
print(f"finetune batch_size: {config.batch_size}")
print(f"finetune learning_rate: {config.optimizer.learning_rate}")
print(f"finetune num_snapshots: {config.num_snapshots}")
print(f"finetune frozen_layers: {frozen_layers}")
am_phone_file = pretrained_model_dir / "phone_id_map.txt"
label_file = input_dir / "labels.txt"
......@@ -181,7 +203,8 @@ if __name__ == '__main__':
generate_finetune_env(output_dir, pretrained_model_dir)
# create a new args for training
train_args = TrainArgs(args.ngpu, config_file, dump_dir, output_dir)
train_args = TrainArgs(args.ngpu, config_file, dump_dir, output_dir,
frozen_layers)
# finetune models
# dispatch
......
###########################################################
# PARAS SETTING #
###########################################################
# Set to -1 to indicate that the parameter is the same as the pretrained model configuration
batch_size: -1
learning_rate: 0.0001 # learning rate
num_snapshots: -1
# frozen_layers should be a list
# if you don't need to freeze, set frozen_layers to []
frozen_layers: ["encoder", "duration_predictor"]
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import math
import os
from operator import itemgetter
from pathlib import Path
......@@ -211,9 +210,9 @@ def extract_feature(duration_file: str,
mel_extractor, pitch_extractor, energy_extractor = get_extractor(config)
wav_files = sorted(list((input_dir).rglob("*.wav")))
# split data into 3 sections, train: 80%, dev: 10%, test: 10%
num_train = math.ceil(len(wav_files) * 0.8)
num_dev = math.ceil(len(wav_files) * 0.1)
# split data into 3 sections, train: len(wav_files) - 2, dev: 1, test: 1
num_train = len(wav_files) - 2
num_dev = 1
print(num_train, num_dev)
train_wav_files = wav_files[:num_train]
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import shutil
from pathlib import Path
from typing import List
import jsonlines
import numpy as np
import paddle
from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from paddlespeech.t2s.datasets.am_batch_fn import fastspeech2_multi_spk_batch_fn
from paddlespeech.t2s.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn
from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Evaluator
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Updater
from paddlespeech.t2s.training.extensions.snapshot import Snapshot
from paddlespeech.t2s.training.extensions.visualizer import VisualDL
from paddlespeech.t2s.training.optimizer import build_optimizers
from paddlespeech.t2s.training.seeding import seed_everything
from paddlespeech.t2s.training.trainer import Trainer
def freeze_layer(model, layers: List[str]):
    """Freeze the given sublayers of the model.
    Args:
        model (nn.Layer): model whose sublayers will be frozen.
        layers (List[str]): names of the sublayers to freeze.
    """
    for layer in layers:
        # look up the named sublayer on the model and freeze its parameters
        for param in eval("model." + layer + ".parameters()"):
            param.trainable = False
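# Note: a hypothetical eval-free variant could resolve the sublayer via
# getattr traversal instead (assumes dot-separated attribute paths such as
# "encoder" or "encoder.embed"):
#     from functools import reduce
#     sublayer = reduce(getattr, layer.split("."), model)
#     for param in sublayer.parameters():
#         param.trainable = False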
def train_sp(args, config):
# decides device type and whether to run in parallel
# setup running environment correctly
if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
paddle.set_device("cpu")
else:
paddle.set_device("gpu")
world_size = paddle.distributed.get_world_size()
if world_size > 1:
paddle.distributed.init_parallel_env()
# set the random seed, it is a must for multiprocess training
seed_everything(config.seed)
print(
f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
)
fields = [
"text", "text_lengths", "speech", "speech_lengths", "durations",
"pitch", "energy"
]
converters = {"speech": np.load, "pitch": np.load, "energy": np.load}
spk_num = None
if args.speaker_dict is not None:
print("multiple speaker fastspeech2!")
collate_fn = fastspeech2_multi_spk_batch_fn
with open(args.speaker_dict, 'rt') as f:
spk_id = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id)
fields += ["spk_id"]
elif args.voice_cloning:
print("Training voice cloning!")
collate_fn = fastspeech2_multi_spk_batch_fn
fields += ["spk_emb"]
converters["spk_emb"] = np.load
else:
print("single speaker fastspeech2!")
collate_fn = fastspeech2_single_spk_batch_fn
print("spk_num:", spk_num)
# dataloader has been too verbose
logging.getLogger("DataLoader").disabled = True
# construct dataset for training and validation
with jsonlines.open(args.train_metadata, 'r') as reader:
train_metadata = list(reader)
train_dataset = DataTable(
data=train_metadata,
fields=fields,
converters=converters, )
with jsonlines.open(args.dev_metadata, 'r') as reader:
dev_metadata = list(reader)
dev_dataset = DataTable(
data=dev_metadata,
fields=fields,
converters=converters, )
# collate function and dataloader
train_sampler = DistributedBatchSampler(
train_dataset,
batch_size=config.batch_size,
shuffle=True,
drop_last=True)
print("samplers done!")
train_dataloader = DataLoader(
train_dataset,
batch_sampler=train_sampler,
collate_fn=collate_fn,
num_workers=config.num_workers)
dev_dataloader = DataLoader(
dev_dataset,
shuffle=False,
drop_last=False,
batch_size=config.batch_size,
collate_fn=collate_fn,
num_workers=config.num_workers)
print("dataloaders done!")
with open(args.phones_dict, "r") as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
odim = config.n_mels
model = FastSpeech2(
idim=vocab_size, odim=odim, spk_num=spk_num, **config["model"])
# freeze layer
if args.frozen_layers != []:
freeze_layer(model, args.frozen_layers)
if world_size > 1:
model = DataParallel(model)
print("model done!")
optimizer = build_optimizers(model, **config["optimizer"])
print("optimizer done!")
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
if dist.get_rank() == 0:
config_name = args.config.split("/")[-1]
# copy conf to output_dir
shutil.copyfile(args.config, output_dir / config_name)
updater = FastSpeech2Updater(
model=model,
optimizer=optimizer,
dataloader=train_dataloader,
output_dir=output_dir,
**config["updater"])
trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)
evaluator = FastSpeech2Evaluator(
model, dev_dataloader, output_dir=output_dir, **config["updater"])
if dist.get_rank() == 0:
trainer.extend(evaluator, trigger=(1, "epoch"))
trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
trainer.extend(
Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
trainer.run()
......@@ -10,11 +10,12 @@ mfa_dir=./mfa_result
dump_dir=./dump
output_dir=./exp/default
lang=zh
ngpu=2
ngpu=1
finetune_config=./finetune.yaml
ckpt=snapshot_iter_96600
ckpt=snapshot_iter_96699
gpus=0,1
gpus=1
CUDA_VISIBLE_DEVICES=${gpus}
stage=0
stop_stage=100
......@@ -35,7 +36,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--output_dir=${output_dir} \
--lang=${lang} \
--ngpu=${ngpu} \
--epoch=100
--epoch=100 \
--finetune_config=${finetune_config}
fi
......@@ -54,7 +56,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_stat=pretrained_models/hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=./test_e2e \
--output_dir=./test_e2e/ \
--phones_dict=${dump_dir}/phone_id_map.txt \
--speaker_dict=${dump_dir}/speaker_id_map.txt \
--spk_id=0
......
# ERNIE-SAT with VCTK dataset
ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as speech editing, personalized speech synthesis, voice cloning, and simultaneous interpretation; this project is provided for research use.
## Model Framework
In ERNIE-SAT, we propose two innovations:
- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping
- The joint mask learning of speech and text is used to realize the alignment of speech and text
<p align="center">
<img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" />
......
......@@ -46,3 +46,10 @@ Pretrain model from https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | 16 | 0.078918 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | 16 | 0.079080 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | 16 | 0.054401 |
| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size | CER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention | -1 | 0.050767 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | -1 | 0.061884 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | -1 | 0.062056 |
| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | -1 | 0.052110 |
......@@ -27,8 +27,10 @@ def warm_up(engine_and_type: str, warm_up_time: int=3) -> bool:
sentence = "您好,欢迎使用语音合成服务。"
elif tts_engine.lang == 'en':
sentence = "Hello and welcome to the speech synthesis service."
elif tts_engine.lang == 'mix':
sentence = "您好,欢迎使用TTS多语种服务。"
else:
logger.error("tts engine only support lang: zh or en.")
logger.error("tts engine only support lang: zh or en or mix.")
sys.exit(-1)
if engine_and_type == "tts_python":
......
from paddlespeech.t2s.frontend.g2pw.onnx_api import G2PWOnnxConverter
from .onnx_api import G2PWOnnxConverter
......@@ -15,6 +15,10 @@
Credits
This code is modified from https://github.com/GitYCC/g2pW
"""
from typing import Dict
from typing import List
from typing import Tuple
import numpy as np
from paddlespeech.t2s.frontend.g2pw.utils import tokenize_and_map
......@@ -23,22 +27,17 @@ ANCHOR_CHAR = '▁'
def prepare_onnx_input(tokenizer,
labels,
char2phonemes,
chars,
texts,
query_ids,
phonemes=None,
pos_tags=None,
use_mask=False,
use_char_phoneme=False,
use_pos=False,
window_size=None,
max_len=512):
labels: List[str],
char2phonemes: Dict[str, List[int]],
chars: List[str],
texts: List[str],
query_ids: List[int],
use_mask: bool=False,
window_size: int=None,
max_len: int=512) -> Dict[str, np.array]:
if window_size is not None:
truncated_texts, truncated_query_ids = _truncate_texts(window_size,
texts, query_ids)
truncated_texts, truncated_query_ids = _truncate_texts(
window_size=window_size, texts=texts, query_ids=query_ids)
input_ids = []
token_type_ids = []
attention_masks = []
......@@ -51,13 +50,19 @@ def prepare_onnx_input(tokenizer,
query_id = (truncated_query_ids if window_size else query_ids)[idx]
try:
tokens, text2token, token2text = tokenize_and_map(tokenizer, text)
tokens, text2token, token2text = tokenize_and_map(
tokenizer=tokenizer, text=text)
except Exception:
print(f'warning: text "{text}" is invalid')
return {}
text, query_id, tokens, text2token, token2text = _truncate(
max_len, text, query_id, tokens, text2token, token2text)
max_len=max_len,
text=text,
query_id=query_id,
tokens=tokens,
text2token=text2token,
token2text=token2text)
processed_tokens = ['[CLS]'] + tokens + ['[SEP]']
......@@ -91,7 +96,8 @@ def prepare_onnx_input(tokenizer,
return outputs
def _truncate_texts(window_size, texts, query_ids):
def _truncate_texts(window_size: int, texts: List[str],
query_ids: List[int]) -> Tuple[List[str], List[int]]:
truncated_texts = []
truncated_query_ids = []
for text, query_id in zip(texts, query_ids):
......@@ -105,7 +111,12 @@ def _truncate_texts(window_size, texts, query_ids):
return truncated_texts, truncated_query_ids
def _truncate(max_len, text, query_id, tokens, text2token, token2text):
def _truncate(max_len: int,
text: str,
query_id: int,
tokens: List[str],
text2token: List[int],
token2text: List[Tuple[int]]):
truncate_len = max_len - 2
if len(tokens) <= truncate_len:
return (text, query_id, tokens, text2token, token2text)
......@@ -132,18 +143,8 @@ def _truncate(max_len, text, query_id, tokens, text2token, token2text):
], [(s - start, e - start) for s, e in token2text[token_start:token_end]])
def prepare_data(sent_path, lb_path=None):
raw_texts = open(sent_path).read().rstrip().split('\n')
query_ids = [raw.index(ANCHOR_CHAR) for raw in raw_texts]
texts = [raw.replace(ANCHOR_CHAR, '') for raw in raw_texts]
if lb_path is None:
return texts, query_ids
else:
phonemes = open(lb_path).read().rstrip().split('\n')
return texts, query_ids, phonemes
def get_phoneme_labels(polyphonic_chars):
def get_phoneme_labels(polyphonic_chars: List[List[str]]
) -> Tuple[List[str], Dict[str, List[int]]]:
labels = sorted(list(set([phoneme for char, phoneme in polyphonic_chars])))
char2phonemes = {}
for char, phoneme in polyphonic_chars:
......@@ -153,7 +154,8 @@ def get_phoneme_labels(polyphonic_chars):
return labels, char2phonemes
def get_char_phoneme_labels(polyphonic_chars):
def get_char_phoneme_labels(polyphonic_chars: List[List[str]]
) -> Tuple[List[str], Dict[str, List[int]]]:
labels = sorted(
list(set([f'{char} {phoneme}' for char, phoneme in polyphonic_chars])))
char2phonemes = {}
......
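A quick, hedged example of what `get_phoneme_labels` returns for a toy input (the output shape is inferred from the construction above; illustrative only):

```python
polyphonic_chars = [["的", "de5"], ["的", "di2"], ["乐", "le4"]]
labels, char2phonemes = get_phoneme_labels(polyphonic_chars=polyphonic_chars)
# labels        -> ['de5', 'di2', 'le4']       (sorted unique phonemes)
# char2phonemes -> {'的': [0, 1], '乐': [2]}    (label indices per character)
```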
......@@ -17,6 +17,10 @@ Credits
"""
import json
import os
from typing import Any
from typing import Dict
from typing import List
from typing import Tuple
import numpy as np
import onnxruntime
......@@ -37,7 +41,8 @@ from paddlespeech.utils.env import MODEL_HOME
model_version = '1.1'
def predict(session, onnx_input, labels):
def predict(session, onnx_input: Dict[str, Any],
labels: List[str]) -> Tuple[List[str], List[float]]:
all_preds = []
all_confidences = []
probs = session.run([], {
......@@ -61,10 +66,10 @@ def predict(session, onnx_input, labels):
class G2PWOnnxConverter:
def __init__(self,
model_dir=MODEL_HOME,
style='bopomofo',
model_source=None,
enable_non_tradional_chinese=False):
model_dir: os.PathLike=MODEL_HOME,
style: str='bopomofo',
model_source: str=None,
enable_non_tradional_chinese: bool=False):
uncompress_path = download_and_decompress(
g2pw_onnx_models['G2PWModel'][model_version], model_dir)
......@@ -76,7 +81,8 @@ class G2PWOnnxConverter:
os.path.join(uncompress_path, 'g2pW.onnx'),
sess_options=sess_options)
self.config = load_config(
os.path.join(uncompress_path, 'config.py'), use_default=True)
config_path=os.path.join(uncompress_path, 'config.py'),
use_default=True)
self.model_source = model_source if model_source else self.config.model_source
self.enable_opencc = enable_non_tradional_chinese
......@@ -103,9 +109,9 @@ class G2PWOnnxConverter:
.strip().split('\n')
]
self.labels, self.char2phonemes = get_char_phoneme_labels(
self.polyphonic_chars
polyphonic_chars=self.polyphonic_chars
) if self.config.use_char_phoneme else get_phoneme_labels(
self.polyphonic_chars)
polyphonic_chars=self.polyphonic_chars)
self.chars = sorted(list(self.char2phonemes.keys()))
......@@ -146,7 +152,7 @@ class G2PWOnnxConverter:
if self.enable_opencc:
self.cc = OpenCC('s2tw')
def _convert_bopomofo_to_pinyin(self, bopomofo):
def _convert_bopomofo_to_pinyin(self, bopomofo: str) -> str:
tone = bopomofo[-1]
assert tone in '12345'
component = self.bopomofo_convert_dict.get(bopomofo[:-1])
......@@ -156,7 +162,7 @@ class G2PWOnnxConverter:
print(f'Warning: "{bopomofo}" cannot convert to pinyin')
return None
def __call__(self, sentences):
def __call__(self, sentences: List[str]) -> List[List[str]]:
if isinstance(sentences, str):
sentences = [sentences]
......@@ -169,23 +175,25 @@ class G2PWOnnxConverter:
sentences = translated_sentences
texts, query_ids, sent_ids, partial_results = self._prepare_data(
sentences)
sentences=sentences)
if len(texts) == 0:
# sentences no polyphonic words
return partial_results
onnx_input = prepare_onnx_input(
self.tokenizer,
self.labels,
self.char2phonemes,
self.chars,
texts,
query_ids,
tokenizer=self.tokenizer,
labels=self.labels,
char2phonemes=self.char2phonemes,
chars=self.chars,
texts=texts,
query_ids=query_ids,
use_mask=self.config.use_mask,
use_char_phoneme=self.config.use_char_phoneme,
window_size=None)
preds, confidences = predict(self.session_g2pW, onnx_input, self.labels)
preds, confidences = predict(
session=self.session_g2pW,
onnx_input=onnx_input,
labels=self.labels)
if self.config.use_char_phoneme:
preds = [pred.split(' ')[1] for pred in preds]
......@@ -195,7 +203,9 @@ class G2PWOnnxConverter:
return results
def _prepare_data(self, sentences):
def _prepare_data(
self, sentences: List[str]
) -> Tuple[List[str], List[int], List[int], List[List[str]]]:
texts, query_ids, sent_ids, partial_results = [], [], [], []
for sent_id, sent in enumerate(sentences):
# pypinyin works better for Simplified Chinese than for Traditional Chinese
......
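A hedged end-to-end usage sketch of the converter (the `style='pinyin'` value is an assumption for illustration; model assets are downloaded on first construction):

```python
from paddlespeech.t2s.frontend.g2pw.onnx_api import G2PWOnnxConverter

# downloads and unpacks the G2PWModel assets on first use
converter = G2PWOnnxConverter(
    style='pinyin', enable_non_tradional_chinese=True)
# one list of per-character phoneme predictions per input sentence
results = converter(["然后再和她确认一下"])
print(results)
```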
......@@ -15,10 +15,11 @@
Credits
This code is modified from https://github.com/GitYCC/g2pW
"""
import os
import re
def wordize_and_map(text):
def wordize_and_map(text: str):
words = []
index_map_from_text_to_word = []
index_map_from_word_to_text = []
......@@ -54,8 +55,8 @@ def wordize_and_map(text):
return words, index_map_from_text_to_word, index_map_from_word_to_text
def tokenize_and_map(tokenizer, text):
words, text2word, word2text = wordize_and_map(text)
def tokenize_and_map(tokenizer, text: str):
words, text2word, word2text = wordize_and_map(text=text)
tokens = []
index_map_from_token_to_text = []
......@@ -82,7 +83,7 @@ def tokenize_and_map(tokenizer, text):
return tokens, index_map_from_text_to_token, index_map_from_token_to_text
def _load_config(config_path):
def _load_config(config_path: os.PathLike):
import importlib.util
spec = importlib.util.spec_from_file_location('__init__', config_path)
config = importlib.util.module_from_spec(spec)
......@@ -130,7 +131,7 @@ default_config_dict = {
}
def load_config(config_path, use_default=False):
def load_config(config_path: os.PathLike, use_default: bool=False):
config = _load_config(config_path)
if use_default:
for attr, val in default_config_dict.items():
......
......@@ -71,31 +71,53 @@ class MLMEncoder(nn.Layer):
"""Conformer encoder module.
Args:
idim (int): Input dimension.
attention_dim (int): Dimension of attention.
attention_heads (int): The number of heads of multi head attention.
linear_units (int): The number of units of position-wise feed forward.
num_blocks (int): The number of decoder blocks.
dropout_rate (float): Dropout rate.
positional_dropout_rate (float): Dropout rate after adding positional encoding.
attention_dropout_rate (float): Dropout rate in attention.
input_layer (Union[str, paddle.nn.Layer]): Input layer type.
normalize_before (bool): Whether to use layer_norm before the first block.
concat_after (bool): Whether to concat attention layer's input and output.
idim (int):
Input dimension.
attention_dim (int):
Dimension of attention.
attention_heads (int):
The number of heads of multi head attention.
linear_units (int):
The number of units of position-wise feed forward.
num_blocks (int):
The number of decoder blocks.
dropout_rate (float):
Dropout rate.
positional_dropout_rate (float):
Dropout rate after adding positional encoding.
attention_dropout_rate (float):
Dropout rate in attention.
input_layer (Union[str, paddle.nn.Layer]):
Input layer type.
normalize_before (bool):
Whether to use layer_norm before the first block.
concat_after (bool):
Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
macaron_style (bool): Whether to use macaron style for positionwise layer.
pos_enc_layer_type (str): Encoder positional encoding layer type.
selfattention_layer_type (str): Encoder attention layer type.
activation_type (str): Encoder activation function type.
use_cnn_module (bool): Whether to use convolution module.
zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
cnn_module_kernel (int): Kernel size of convolution module.
padding_idx (int): Padding idx for input_layer=embed.
stochastic_depth_rate (float): Maximum probability to skip the encoder layer.
positionwise_layer_type (str):
"linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size (int):
Kernel size of positionwise conv1d layer.
macaron_style (bool):
Whether to use macaron style for positionwise layer.
pos_enc_layer_type (str):
Encoder positional encoding layer type.
selfattention_layer_type (str):
Encoder attention layer type.
activation_type (str):
Encoder activation function type.
use_cnn_module (bool):
Whether to use convolution module.
zero_triu (bool):
Whether to zero the upper triangular part of attention matrix.
cnn_module_kernel (int):
Kernel size of convolution module.
padding_idx (int):
Padding idx for input_layer=embed.
stochastic_depth_rate (float):
Maximum probability to skip the encoder layer.
"""
......@@ -320,12 +342,16 @@ class MLMDecoder(MLMEncoder):
"""Encode input sequence.
Args:
xs (paddle.Tensor): Input tensor (#batch, time, idim).
masks (paddle.Tensor): Mask tensor (#batch, time).
xs (paddle.Tensor):
Input tensor (#batch, time, idim).
masks (paddle.Tensor):
Mask tensor (#batch, time).
Returns:
paddle.Tensor: Output tensor (#batch, time, attention_dim).
paddle.Tensor: Mask tensor (#batch, time).
paddle.Tensor:
Output tensor (#batch, time, attention_dim).
paddle.Tensor:
Mask tensor (#batch, time).
"""
xs = self.embed(xs)
......@@ -392,19 +418,27 @@ class MLM(nn.Layer):
use_teacher_forcing: bool=True, ) -> List[paddle.Tensor]:
'''
Args:
speech (paddle.Tensor): input speech (1, Tmax, D).
text (paddle.Tensor): input text (1, Tmax2).
masked_pos (paddle.Tensor): masked position of input speech (1, Tmax)
speech_mask (paddle.Tensor): mask of speech (1, 1, Tmax).
text_mask (paddle.Tensor): mask of text (1, 1, Tmax2).
speech_seg_pos (paddle.Tensor): n-th phone of each mel, 0<=n<=Tmax2 (1, Tmax).
text_seg_pos (paddle.Tensor): n-th phone of each phone, 0<=n<=Tmax2 (1, Tmax2).
span_bdy (List[int]): masked mel boundary of input speech (2,)
use_teacher_forcing (bool): whether to use teacher forcing
speech (paddle.Tensor):
input speech (1, Tmax, D).
text (paddle.Tensor):
input text (1, Tmax2).
masked_pos (paddle.Tensor):
masked position of input speech (1, Tmax)
speech_mask (paddle.Tensor):
mask of speech (1, 1, Tmax).
text_mask (paddle.Tensor):
mask of text (1, 1, Tmax2).
speech_seg_pos (paddle.Tensor):
n-th phone of each mel, 0<=n<=Tmax2 (1, Tmax).
text_seg_pos (paddle.Tensor):
n-th phone of each phone, 0<=n<=Tmax2 (1, Tmax2).
span_bdy (List[int]):
masked mel boundary of input speech (2,)
use_teacher_forcing (bool):
whether to use teacher forcing
Returns:
List[Tensor]:
eg:
[Tensor(shape=[1, 181, 80]), Tensor(shape=[80, 80]), Tensor(shape=[1, 67, 80])]
eg: [Tensor(shape=[1, 181, 80]), Tensor(shape=[80, 80]), Tensor(shape=[1, 67, 80])]
'''
z_cache = None
......
......@@ -48,12 +48,18 @@ class StochasticDurationPredictor(nn.Layer):
global_channels: int=-1, ):
"""Initialize StochasticDurationPredictor module.
Args:
channels (int): Number of channels.
kernel_size (int): Kernel size.
dropout_rate (float): Dropout rate.
flows (int): Number of flows.
dds_conv_layers (int): Number of conv layers in DDS conv.
global_channels (int): Number of global conditioning channels.
channels (int):
Number of channels.
kernel_size (int):
Kernel size.
dropout_rate (float):
Dropout rate.
flows (int):
Number of flows.
dds_conv_layers (int):
Number of conv layers in DDS conv.
global_channels (int):
Number of global conditioning channels.
"""
super().__init__()
......@@ -108,14 +114,21 @@ class StochasticDurationPredictor(nn.Layer):
noise_scale: float=1.0, ) -> paddle.Tensor:
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, channels, T_text).
x_mask (Tensor): Mask tensor (B, 1, T_text).
w (Optional[Tensor]): Duration tensor (B, 1, T_text).
g (Optional[Tensor]): Global conditioning tensor (B, channels, 1)
inverse (bool): Whether to inverse the flow.
noise_scale (float): Noise scale value.
x (Tensor):
Input tensor (B, channels, T_text).
x_mask (Tensor):
Mask tensor (B, 1, T_text).
w (Optional[Tensor]):
Duration tensor (B, 1, T_text).
g (Optional[Tensor]):
Global conditioning tensor (B, channels, 1)
inverse (bool):
Whether to inverse the flow.
noise_scale (float):
Noise scale value.
Returns:
Tensor: If not inverse, negative log-likelihood (NLL) tensor (B,).
Tensor:
If not inverse, negative log-likelihood (NLL) tensor (B,).
If inverse, log-duration tensor (B, 1, T_text).
"""
# stop gradient
......
......@@ -34,11 +34,15 @@ class FlipFlow(nn.Layer):
) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]:
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, channels, T).
inverse (bool): Whether to inverse the flow.
x (Tensor):
Input tensor (B, channels, T).
inverse (bool):
Whether to inverse the flow.
Returns:
Tensor: Flipped tensor (B, channels, T).
Tensor: Log-determinant tensor for NLL (B,) if not inverse.
Tensor:
Flipped tensor (B, channels, T).
Tensor:
Log-determinant tensor for NLL (B,) if not inverse.
"""
x = paddle.flip(x, [1])
if not inverse:
......@@ -60,13 +64,19 @@ class LogFlow(nn.Layer):
) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]:
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, channels, T).
x_mask (Tensor): Mask tensor (B, 1, T).
inverse (bool): Whether to inverse the flow.
eps (float): Epsilon for log.
x (Tensor):
Input tensor (B, channels, T).
x_mask (Tensor):
Mask tensor (B, 1, T).
inverse (bool):
Whether to inverse the flow.
eps (float):
Epsilon for log.
Returns:
Tensor: Output tensor (B, channels, T).
Tensor: Log-determinant tensor for NLL (B,) if not inverse.
Tensor:
Output tensor (B, channels, T).
Tensor:
Log-determinant tensor for NLL (B,) if not inverse.
"""
if not inverse:
y = paddle.log(paddle.clip(x, min=eps)) * x_mask
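# Since y = log(x), dy/dx = 1/x, so the log-determinant accumulated for the
# NLL is sum(-y) over the masked channel and time positions.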
......@@ -83,7 +93,8 @@ class ElementwiseAffineFlow(nn.Layer):
def __init__(self, channels: int):
"""Initialize ElementwiseAffineFlow module.
Args:
channels (int): Number of channels.
channels (int):
Number of channels.
"""
super().__init__()
self.channels = channels
......@@ -107,12 +118,17 @@ class ElementwiseAffineFlow(nn.Layer):
) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]:
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, channels, T).
x_mask (Tensor): Mask tensor (B, 1, T).
inverse (bool): Whether to inverse the flow.
x (Tensor):
Input tensor (B, channels, T).
x_mask (Tensor):
Mask tensor (B, 1, T).
inverse (bool):
Whether to inverse the flow.
Returns:
Tensor: Output tensor (B, channels, T).
Tensor: Log-determinant tensor for NLL (B,) if not inverse.
Tensor:
Output tensor (B, channels, T).
Tensor:
Log-determinant tensor for NLL (B,) if not inverse.
"""
if not inverse:
y = self.m + paddle.exp(self.logs) * x
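# The Jacobian of this elementwise affine map is diagonal with entries
# exp(logs), so the log-determinant for the NLL is sum(logs * x_mask).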
......@@ -157,11 +173,16 @@ class DilatedDepthSeparableConv(nn.Layer):
eps: float=1e-5, ):
"""Initialize DilatedDepthSeparableConv module.
Args:
channels (int): Number of channels.
kernel_size (int): Kernel size.
layers (int): Number of layers.
dropout_rate (float): Dropout rate.
eps (float): Epsilon for layer norm.
channels (int):
Number of channels.
kernel_size (int):
Kernel size.
layers (int):
Number of layers.
dropout_rate (float):
Dropout rate.
eps (float):
Epsilon for layer norm.
"""
super().__init__()
......@@ -198,11 +219,15 @@ class DilatedDepthSeparableConv(nn.Layer):
g: Optional[paddle.Tensor]=None) -> paddle.Tensor:
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, in_channels, T).
x_mask (Tensor): Mask tensor (B, 1, T).
g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1).
x (Tensor):
Input tensor (B, in_channels, T).
x_mask (Tensor):
Mask tensor (B, 1, T).
g (Optional[Tensor]):
Global conditioning tensor (B, global_channels, 1).
Returns:
Tensor: Output tensor (B, channels, T).
Tensor:
Output tensor (B, channels, T).
"""
if g is not None:
x = x + g
......@@ -225,12 +250,18 @@ class ConvFlow(nn.Layer):
tail_bound: float=5.0, ):
"""Initialize ConvFlow module.
Args:
in_channels (int): Number of input channels.
hidden_channels (int): Number of hidden channels.
kernel_size (int): Kernel size.
layers (int): Number of layers.
bins (int): Number of bins.
tail_bound (float): Tail bound value.
in_channels (int):
Number of input channels.
hidden_channels (int):
Number of hidden channels.
kernel_size (int):
Kernel size.
layers (int):
Number of layers.
bins (int):
Number of bins.
tail_bound (float):
Tail bound value.
"""
super().__init__()
self.half_channels = in_channels // 2
......@@ -275,13 +306,19 @@ class ConvFlow(nn.Layer):
) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]:
"""Calculate forward propagation.
Args:
x (Tensor):
Input tensor (B, channels, T).
x_mask (Tensor):
Mask tensor (B, 1, T).
g (Optional[Tensor]):
Global conditioning tensor (B, channels, 1).
inverse (bool):
Whether to invert the flow.
Returns:
Tensor:
Output tensor (B, channels, T).
Tensor:
Log-determinant tensor for NLL (B,) if not inverse.
"""
xa, xb = x.split(2, 1)
h = self.input_conv(xa)
......
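A sketch of the coupling split at the top of ConvFlow.forward; an even channel count is assumed, and in the real module the xa half conditions the spline applied to xb.

import paddle

x = paddle.randn([2, 4, 8])  # channels must be even for the half split
xa, xb = x.split(2, 1)       # two (B, 2, T) halves along the channel axis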
......@@ -97,81 +97,104 @@ class VITSGenerator(nn.Layer):
stochastic_duration_predictor_dds_conv_layers: int=3, ):
"""Initialize VITS generator module.
Args:
vocabs (int):
Input vocabulary size.
aux_channels (int):
Number of acoustic feature channels.
hidden_channels (int):
Number of hidden channels.
spks (Optional[int]):
Number of speakers. If set to > 1, assume that the
sids will be provided as the input and use sid embedding layer.
langs (Optional[int]):
Number of languages. If set to > 1, assume that the
lids will be provided as the input and use lid embedding layer.
spk_embed_dim (Optional[int]):
Speaker embedding dimension. If set to > 0,
assume that spembs will be provided as the input.
global_channels (int):
Number of global conditioning channels.
segment_size (int):
Segment size for decoder.
text_encoder_attention_heads (int):
Number of heads in conformer block of text encoder.
text_encoder_ffn_expand (int):
Expansion ratio of FFN in conformer block of text encoder.
text_encoder_blocks (int):
Number of conformer blocks in text encoder.
text_encoder_positionwise_layer_type (str):
Position-wise layer type in conformer block of text encoder.
text_encoder_positionwise_conv_kernel_size (int):
Position-wise convolution kernel size in conformer block of text encoder.
Only used when the above layer type is conv1d or conv1d-linear.
text_encoder_positional_encoding_layer_type (str):
Positional encoding layer type in conformer block of text encoder.
text_encoder_self_attention_layer_type (str):
Self-attention layer type in conformer block of text encoder.
text_encoder_activation_type (str):
Activation function type in conformer block of text encoder.
text_encoder_normalize_before (bool):
Whether to apply layer norm before self-attention in conformer block of text encoder.
text_encoder_dropout_rate (float):
Dropout rate in conformer block of text encoder.
text_encoder_positional_dropout_rate (float):
Dropout rate for positional encoding in conformer block of text encoder.
text_encoder_attention_dropout_rate (float):
Dropout rate for attention in conformer block of text encoder.
text_encoder_conformer_kernel_size (int):
Conformer conv kernel size. It is used only when use_conformer_conv_in_text_encoder = True.
use_macaron_style_in_text_encoder (bool):
Whether to use macaron style FFN in conformer block of text encoder.
use_conformer_conv_in_text_encoder (bool):
Whether to use convolution in conformer block of text encoder.
decoder_kernel_size (int):
Decoder kernel size.
decoder_channels (int):
Number of decoder initial channels.
decoder_upsample_scales (List[int]):
List of upsampling scales in decoder.
decoder_upsample_kernel_sizes (List[int]):
List of kernel size for upsampling layers in decoder.
decoder_resblock_kernel_sizes (List[int]):
List of kernel size for resblocks in decoder.
decoder_resblock_dilations (List[List[int]]):
List of list of dilations for resblocks in decoder.
use_weight_norm_in_decoder (bool):
Whether to apply weight normalization in decoder.
posterior_encoder_kernel_size (int):
Posterior encoder kernel size.
posterior_encoder_layers (int):
Number of layers of posterior encoder.
posterior_encoder_stacks (int):
Number of stacks of posterior encoder.
posterior_encoder_base_dilation (int):
Base dilation of posterior encoder.
posterior_encoder_dropout_rate (float):
Dropout rate for posterior encoder.
use_weight_norm_in_posterior_encoder (bool):
Whether to apply weight normalization in posterior encoder.
flow_flows (int):
Number of flows in flow.
flow_kernel_size (int):
Kernel size in flow.
flow_base_dilation (int):
Base dilation in flow.
flow_layers (int):
Number of layers in flow.
flow_dropout_rate (float):
Dropout rate in flow.
use_weight_norm_in_flow (bool):
Whether to apply weight normalization in flow.
use_only_mean_in_flow (bool):
Whether to use only mean in flow.
stochastic_duration_predictor_kernel_size (int):
Kernel size in stochastic duration predictor.
stochastic_duration_predictor_dropout_rate (float):
Dropout rate in stochastic duration predictor.
stochastic_duration_predictor_flows (int):
Number of flows in stochastic duration predictor.
stochastic_duration_predictor_dds_conv_layers (int):
Number of DDS conv layers in stochastic duration predictor.
"""
super().__init__()
self.segment_size = segment_size
......@@ -272,27 +295,40 @@ class VITSGenerator(nn.Layer):
paddle.Tensor, paddle.Tensor, ], ]:
"""Calculate forward propagation.
Args:
text (Tensor):
Text index tensor (B, T_text).
text_lengths (Tensor):
Text length tensor (B,).
feats (Tensor):
Feature tensor (B, aux_channels, T_feats).
feats_lengths (Tensor):
Feature length tensor (B,).
sids (Optional[Tensor]):
Speaker index tensor (B,) or (B, 1).
spembs (Optional[Tensor]):
Speaker embedding tensor (B, spk_embed_dim).
lids (Optional[Tensor]):
Language index tensor (B,) or (B, 1).
Returns:
Tensor:
Waveform tensor (B, 1, segment_size * upsample_factor).
Tensor:
Duration negative log-likelihood (NLL) tensor (B,).
Tensor:
Monotonic attention weight tensor (B, 1, T_feats, T_text).
Tensor:
Segments start index tensor (B,).
Tensor:
Text mask tensor (B, 1, T_text).
Tensor:
Feature mask tensor (B, 1, T_feats).
tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]:
- Tensor: Posterior encoder hidden representation (B, H, T_feats).
- Tensor: Flow hidden representation (B, H, T_feats).
- Tensor: Expanded text encoder projected mean (B, H, T_feats).
- Tensor: Expanded text encoder projected scale (B, H, T_feats).
- Tensor: Posterior encoder projected mean (B, H, T_feats).
- Tensor: Posterior encoder projected scale (B, H, T_feats).
"""
# forward text encoder
x, m_p, logs_p, x_mask = self.text_encoder(text, text_lengths)
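A hedged sketch of a dummy training batch with the documented shapes; the generator instance and the 80-dim features are assumptions, not part of this file.

import paddle

text = paddle.randint(0, 100, [2, 13])      # (B, T_text) token ids
text_lengths = paddle.to_tensor([13, 10])   # (B,)
feats = paddle.randn([2, 80, 50])           # (B, aux_channels, T_feats), aux_channels=80 assumed
feats_lengths = paddle.to_tensor([50, 44])  # (B,)
# outputs = generator(text, text_lengths, feats, feats_lengths)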
......@@ -402,24 +438,40 @@ class VITSGenerator(nn.Layer):
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Run inference.
Args:
text (Tensor):
Input text index tensor (B, T_text,).
text_lengths (Tensor):
Text length tensor (B,).
feats (Tensor):
Feature tensor (B, aux_channels, T_feats,).
feats_lengths (Tensor):
Feature length tensor (B,).
sids (Optional[Tensor]):
Speaker index tensor (B,) or (B, 1).
spembs (Optional[Tensor]):
Speaker embedding tensor (B, spk_embed_dim).
lids (Optional[Tensor]):
Language index tensor (B,) or (B, 1).
dur (Optional[Tensor]):
Ground-truth duration (B, T_text,). If provided,
skip the prediction of durations (i.e., teacher forcing).
noise_scale (float):
Noise scale parameter for flow.
noise_scale_dur (float):
Noise scale parameter for duration predictor.
alpha (float):
Alpha parameter to control the speed of generated speech.
max_len (Optional[int]):
Maximum length of acoustic feature sequence.
use_teacher_forcing (bool):
Whether to use teacher forcing.
Returns:
Tensor:
Generated waveform tensor (B, T_wav).
Tensor:
Monotonic attention weight tensor (B, T_feats, T_text).
Tensor:
Duration tensor (B, T_text).
"""
# encoder
x, m_p, logs_p, x_mask = self.text_encoder(text, text_lengths)
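A sketch of how alpha could rescale the predicted durations at inference time; logw stands in for the duration predictor output, and the exact multiplication site is an assumption.

import paddle

logw = paddle.randn([2, 1, 13])        # predicted log-durations (B, 1, T_text)
x_mask = paddle.ones([2, 1, 13])
alpha = 1.2                            # here, larger alpha lengthens durations (slower speech)
w = paddle.exp(logw) * x_mask * alpha
dur = paddle.ceil(w)                   # integer frames per input token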
......@@ -533,15 +585,23 @@ class VITSGenerator(nn.Layer):
lids: Optional[paddle.Tensor]=None, ) -> paddle.Tensor:
"""Run voice conversion.
Args:
feats (Tensor):
Feature tensor (B, aux_channels, T_feats,).
feats_lengths (Tensor):
Feature length tensor (B,).
sids_src (Optional[Tensor]):
Speaker index tensor of source feature (B,) or (B, 1).
sids_tgt (Optional[Tensor]):
Speaker index tensor of target feature (B,) or (B, 1).
spembs_src (Optional[Tensor]):
Speaker embedding tensor of source feature (B, spk_embed_dim).
spembs_tgt (Optional[Tensor]):
Speaker embedding tensor of target feature (B, spk_embed_dim).
lids (Optional[Tensor]):
Language index tensor (B,) or (B, 1).
Returns:
Tensor:
Generated waveform tensor (B, T_wav).
"""
# encoder
g_src = None
......@@ -602,10 +662,13 @@ class VITSGenerator(nn.Layer):
mask: paddle.Tensor) -> paddle.Tensor:
"""Generate path a.k.a. monotonic attention.
Args:
dur (Tensor):
Duration tensor (B, 1, T_text).
mask (Tensor):
Attention mask tensor (B, 1, T_feats, T_text).
Returns:
Tensor:
Path tensor (B, 1, T_feats, T_text).
"""
b, _, t_y, t_x = paddle.shape(mask)
cum_dur = paddle.cumsum(dur, -1)
......
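A toy illustration of the monotonic path idea: cumulative durations assign each feature frame to exactly one text token.

import paddle

dur = paddle.to_tensor([[[2., 1., 3.]]])          # (B, 1, T_text) durations per token
cum_dur = paddle.cumsum(dur, -1)                  # cumulative ends: [2., 3., 6.]
frames = paddle.arange(6, dtype='float32')        # T_feats = 6 frame indices
mask = frames.unsqueeze(-1) < cum_dur.squeeze(1)  # (T_feats, T_text) coverage mask
assign = mask.astype('int64').argmax(-1)          # token per frame: [0, 0, 1, 2, 2, 2]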
......@@ -52,17 +52,28 @@ class PosteriorEncoder(nn.Layer):
"""Initilialize PosteriorEncoder module.
Args:
in_channels (int):
Number of input channels.
out_channels (int):
Number of output channels.
hidden_channels (int):
Number of hidden channels.
kernel_size (int):
Kernel size in WaveNet.
layers (int):
Number of layers of WaveNet.
stacks (int):
Number of repeat stacking of WaveNet.
base_dilation (int):
Base dilation factor.
global_channels (int):
Number of global conditioning channels.
dropout_rate (float):
Dropout rate.
bias (bool):
Whether to use bias parameters in conv.
use_weight_norm (bool):
Whether to apply weight norm.
"""
super().__init__()
......@@ -99,15 +110,22 @@ class PosteriorEncoder(nn.Layer):
"""Calculate forward propagation.
Args:
x (Tensor):
Input tensor (B, in_channels, T_feats).
x_lengths (Tensor):
Length tensor (B,).
g (Optional[Tensor]):
Global conditioning tensor (B, global_channels, 1).
Returns:
Tensor:
Encoded hidden representation tensor (B, out_channels, T_feats).
Tensor:
Projected mean tensor (B, out_channels, T_feats).
Tensor:
Projected scale tensor (B, out_channels, T_feats).
Tensor:
Mask tensor for input tensor (B, 1, T_feats).
"""
x_mask = make_non_pad_mask(x_lengths).unsqueeze(1)
......
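A sketch of the reparameterization that follows the projection above; m and logs stand in for the projected statistics, and the channel count is illustrative.

import paddle

m = paddle.zeros([2, 192, 50])     # projected mean (B, out_channels, T_feats)
logs = paddle.zeros([2, 192, 50])  # projected log-scale
x_mask = paddle.ones([2, 1, 50])
z = (m + paddle.randn(m.shape) * paddle.exp(logs)) * x_mask  # sampled latent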
......@@ -55,18 +55,30 @@ class ResidualAffineCouplingBlock(nn.Layer):
"""Initilize ResidualAffineCouplingBlock module.
Args:
in_channels (int):
Number of input channels.
hidden_channels (int):
Number of hidden channels.
flows (int):
Number of flows.
kernel_size (int):
Kernel size for WaveNet.
base_dilation (int):
Base dilation factor for WaveNet.
layers (int):
Number of layers of WaveNet.
stacks (int):
Number of stacks of WaveNet.
global_channels (int):
Number of global channels.
dropout_rate (float):
Dropout rate.
use_weight_norm (bool):
Whether to use weight normalization in WaveNet.
bias (bool):
Whether to use bias parameters in WaveNet.
use_only_mean (bool):
Whether to estimate only mean.
"""
super().__init__()
......@@ -97,10 +109,14 @@ class ResidualAffineCouplingBlock(nn.Layer):
"""Calculate forward propagation.
Args:
x (Tensor):
Input tensor (B, in_channels, T).
x_mask (Tensor):
Length tensor (B, 1, T).
g (Optional[Tensor]):
Global conditioning tensor (B, global_channels, 1).
inverse (bool):
Whether to invert the flow.
Returns:
Tensor: Output tensor (B, in_channels, T).
......@@ -134,17 +150,28 @@ class ResidualAffineCouplingLayer(nn.Layer):
"""Initialzie ResidualAffineCouplingLayer module.
Args:
in_channels (int):
Number of input channels.
hidden_channels (int):
Number of hidden channels.
kernel_size (int):
Kernel size for WaveNet.
base_dilation (int):
Base dilation factor for WaveNet.
layers (int):
Number of layers of WaveNet.
stacks (int):
Number of stacks of WaveNet.
global_channels (int):
Number of global channels.
dropout_rate (float):
Dropout rate.
use_weight_norm (bool):
Whether to use weight normalization in WaveNet.
bias (bool):
Whether to use bias parameters in WaveNet.
use_only_mean (bool):
Whether to estimate only mean.
"""
assert in_channels % 2 == 0, "in_channels should be divisible by 2"
......@@ -211,14 +238,20 @@ class ResidualAffineCouplingLayer(nn.Layer):
"""Calculate forward propagation.
Args:
x (Tensor):
Input tensor (B, in_channels, T).
x_lengths (Tensor):
Length tensor (B,).
g (Optional[Tensor]):
Global conditioning tensor (B, global_channels, 1).
inverse (bool):
Whether to invert the flow.
Returns:
Tensor:
Output tensor (B, in_channels, T).
Tensor:
Log-determinant tensor for NLL (B,) if not inverse.
"""
xa, xb = paddle.split(x, 2, axis=1)
......
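A sketch of the affine coupling pattern the split above feeds, with dummy statistics in place of the WaveNet-predicted mean and log-scale.

import paddle

x = paddle.randn([2, 192, 50])       # in_channels must be even
xa, xb = paddle.split(x, 2, axis=1)
m = paddle.zeros_like(xb)            # mean the WaveNet would predict from xa
logs = paddle.zeros_like(xb)         # log-scale (all zeros when use_only_mean=True)
yb = m + xb * paddle.exp(logs)       # transform one half, keep the other
y = paddle.concat([xa, yb], axis=1)
logdet = paddle.sum(logs, [1, 2])    # log-determinant for NLL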
......@@ -62,23 +62,40 @@ class TextEncoder(nn.Layer):
"""Initialize TextEncoder module.
Args:
vocabs (int):
Vocabulary size.
attention_dim (int):
Attention dimension.
attention_heads (int):
Number of attention heads.
linear_units (int):
Number of linear units of positionwise layers.
blocks (int):
Number of encoder blocks.
positionwise_layer_type (str):
Positionwise layer type.
positionwise_conv_kernel_size (int):
Positionwise layer's kernel size.
positional_encoding_layer_type (str):
Positional encoding layer type.
self_attention_layer_type (str):
Self-attention layer type.
activation_type (str):
Activation function type.
normalize_before (bool):
Whether to apply LayerNorm before attention.
use_macaron_style (bool):
Whether to use macaron style components.
use_conformer_conv (bool):
Whether to use conformer conv layers.
conformer_kernel_size (int):
Conformer's conv kernel size.
dropout_rate (float):
Dropout rate.
positional_dropout_rate (float):
Dropout rate for positional encoding.
attention_dropout_rate (float):
Dropout rate for attention.
"""
super().__init__()
......@@ -121,14 +138,20 @@ class TextEncoder(nn.Layer):
"""Calculate forward propagation.
Args:
x (Tensor):
Input index tensor (B, T_text).
x_lengths (Tensor):
Length tensor (B,).
Returns:
Tensor:
Encoded hidden representation (B, attention_dim, T_text).
Tensor:
Projected mean tensor (B, attention_dim, T_text).
Tensor:
Projected scale tensor (B, attention_dim, T_text).
Tensor:
Mask tensor for input tensor (B, 1, T_text).
"""
x = self.emb(x) * math.sqrt(self.attention_dim)
......
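A sketch of the scaled embedding in the snippet above; the sizes are illustrative.

import math
import paddle

vocabs, attention_dim = 100, 192
emb = paddle.nn.Embedding(vocabs, attention_dim)
x = paddle.randint(0, vocabs, [2, 13])  # (B, T_text)
h = emb(x) * math.sqrt(attention_dim)   # scale before the encoder blocks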
......@@ -156,17 +156,25 @@ class VITS(nn.Layer):
init_type: str="xavier_uniform", ):
"""Initialize VITS module.
Args:
idim (int):
Input vocabulary size.
odim (int):
Acoustic feature dimension. The actual output channels will
be 1, since VITS is an end-to-end text-to-wave model, but
odim is kept to indicate the acoustic feature dimension for compatibility.
sampling_rate (int):
Sampling rate; not used for training, but it will
be referred to when saving the waveform during inference.
generator_type (str):
Generator type.
generator_params (Dict[str, Any]):
Parameter dict for generator.
discriminator_type (str):
Discriminator type.
discriminator_params (Dict[str, Any]):
Parameter dict for discriminator.
cache_generator_outputs (bool):
Whether to cache generator outputs.
"""
assert check_argument_types()
super().__init__()
......@@ -218,14 +226,22 @@ class VITS(nn.Layer):
forward_generator: bool=True, ) -> Dict[str, Any]:
"""Perform generator forward.
Args:
text (Tensor):
Text index tensor (B, T_text).
text_lengths (Tensor):
Text length tensor (B,).
feats (Tensor):
Feature tensor (B, T_feats, aux_channels).
feats_lengths (Tensor):
Feature length tensor (B,).
sids (Optional[Tensor]):
Speaker index tensor (B,) or (B, 1).
spembs (Optional[Tensor]):
Speaker embedding tensor (B, spk_embed_dim).
lids (Optional[Tensor]):
Language index tensor (B,) or (B, 1).
forward_generator (bool):
Whether to forward generator.
Returns:
"""
......@@ -259,13 +275,20 @@ class VITS(nn.Layer):
lids: Optional[paddle.Tensor]=None, ) -> Dict[str, Any]:
"""Perform generator forward.
Args:
text (Tensor):
Text index tensor (B, T_text).
text_lengths (Tensor):
Text length tensor (B,).
feats (Tensor):
Feature tensor (B, T_feats, aux_channels).
feats_lengths (Tensor):
Feature length tensor (B,).
sids (Optional[Tensor]):
Speaker index tensor (B,) or (B, 1).
spembs (Optional[Tensor]):
Speaker embedding tensor (B, spk_embed_dim).
lids (Optional[Tensor]):
Language index tensor (B,) or (B, 1).
Returns:
"""
......@@ -304,13 +327,20 @@ class VITS(nn.Layer):
lids: Optional[paddle.Tensor]=None, ) -> Dict[str, Any]:
"""Perform discriminator forward.
Args:
text (Tensor):
Text index tensor (B, T_text).
text_lengths (Tensor):
Text length tensor (B,).
feats (Tensor):
Feature tensor (B, T_feats, aux_channels).
feats_lengths (Tensor):
Feature length tensor (B,).
sids (Optional[Tensor]):
Speaker index tensor (B,) or (B, 1).
spembs (Optional[Tensor]):
Speaker embedding tensor (B, spk_embed_dim).
lids (Optional[Tensor]):
Language index tensor (B,) or (B, 1).
Returns:
"""
......@@ -353,22 +383,36 @@ class VITS(nn.Layer):
use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]:
"""Run inference.
Args:
text (Tensor):
Input text index tensor (T_text,).
feats (Tensor):
Feature tensor (T_feats, aux_channels).
sids (Tensor):
Speaker index tensor (1,).
spembs (Optional[Tensor]):
Speaker embedding tensor (spk_embed_dim,).
lids (Tensor):
Language index tensor (1,).
durations (Tensor):
Ground-truth duration tensor (T_text,).
noise_scale (float):
Noise scale value for flow.
noise_scale_dur (float):
Noise scale value for duration predictor.
alpha (float):
Alpha parameter to control the speed of generated speech.
max_len (Optional[int]):
Maximum length.
use_teacher_forcing (bool):
Whether to use teacher forcing.
Returns:
Dict[str, Tensor]:
* wav (Tensor):
Generated waveform tensor (T_wav,).
* att_w (Tensor):
Monotonic attention weight tensor (T_feats, T_text).
* duration (Tensor):
Predicted duration tensor (T_text,).
"""
# setup
text = text[None]
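A sketch mirroring the single-utterance batch-axis setup above; the vits instance and vocabulary size are assumptions.

import paddle

text = paddle.randint(0, 100, [13])  # (T_text,) one utterance
text = text[None]                    # -> (1, T_text), as in the line above
text_lengths = paddle.to_tensor([text.shape[1]])
# out = vits.inference(...)  # returns a dict with wav, att_w, and duration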
......@@ -417,15 +461,22 @@ class VITS(nn.Layer):
lids: Optional[paddle.Tensor]=None, ) -> paddle.Tensor:
"""Run voice conversion.
Args:
feats (Tensor):
Feature tensor (T_feats, aux_channels).
sids_src (Optional[Tensor]):
Speaker index tensor of source feature (1,).
sids_tgt (Optional[Tensor]):
Speaker index tensor of target feature (1,).
spembs_src (Optional[Tensor]):
Speaker embedding tensor of source feature (spk_embed_dim,).
spembs_tgt (Optional[Tensor]):
Speaker embedding tensor of target feature (spk_embed_dim,).
lids (Optional[Tensor]):
Language index tensor (1,).
Returns:
Dict[str, Tensor]:
* wav (Tensor):
Generated waveform tensor (T_wav,).
"""
assert feats is not None
feats = feats[None].transpose([0, 2, 1])
......
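A sketch of the feature layout change in the snippet above:

import paddle

feats = paddle.randn([50, 80])            # (T_feats, aux_channels)
feats = feats[None].transpose([0, 2, 1])  # -> (1, aux_channels, T_feats)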
......@@ -39,14 +39,22 @@ class ResidualBlock(nn.Layer):
"""Initialize ResidualBlock module.
Args:
kernel_size (int):
Kernel size of dilation convolution layer.
residual_channels (int):
Number of channels for residual connection.
skip_channels (int):
Number of channels for skip connection.
aux_channels (int):
Number of local conditioning channels.
dropout (float):
Dropout probability.
dilation (int):
Dilation factor.
bias (bool):
Whether to add bias parameter in convolution layers.
scale_residual (bool):
Whether to scale the residual outputs.
"""
super().__init__()
......
......@@ -47,25 +47,42 @@ class WaveNet(nn.Layer):
"""Initialize WaveNet module.
Args:
in_channels (int):
Number of input channels.
out_channels (int):
Number of output channels.
kernel_size (int):
Kernel size of dilated convolution.
layers (int):
Number of residual block layers.
stacks (int):
Number of stacks i.e., dilation cycles.
base_dilation (int):
Base dilation factor.
residual_channels (int):
Number of channels in residual conv.
gate_channels (int):
Number of channels in gated conv.
skip_channels (int):
Number of channels in skip conv.
aux_channels (int):
Number of channels for local conditioning feature.
global_channels (int):
Number of channels for global conditioning feature.
dropout_rate (float):
Dropout rate. 0.0 means no dropout applied.
bias (bool):
Whether to use bias parameter in conv layer.
use_weight_norm (bool):
Whether to use weight norm. If set to true, it will be applied to all of the conv layers.
use_first_conv (bool):
Whether to use the first conv layers.
use_last_conv (bool):
Whether to use the last conv layers.
scale_residual (bool):
Whether to scale the residual outputs.
scale_skip_connect (bool):
Whether to scale the skip connection outputs.
"""
super().__init__()
......@@ -128,15 +145,18 @@ class WaveNet(nn.Layer):
"""Calculate forward propagation.
Args:
x (Tensor):
Input noise signal (B, 1, T) if use_first_conv else (B, residual_channels, T).
x_mask (Optional[Tensor]):
Mask tensor (B, 1, T).
c (Optional[Tensor]):
Local conditioning features (B, aux_channels, T).
g (Optional[Tensor]):
Global conditioning features (B, global_channels, 1).
Returns:
Tensor:
Output tensor (B, out_channels, T) if use_last_conv else (B, residual_channels, T).
"""
# encode to hidden representation
......
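A sketch of the dilation cycle implied by layers, stacks, and base_dilation; the values are illustrative.

layers, stacks, base_dilation = 6, 2, 2
layers_per_stack = layers // stacks
dilations = [base_dilation ** (i % layers_per_stack) for i in range(layers)]
# -> [1, 2, 4, 1, 2, 4]: each stack repeats the dilation cycle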
......@@ -69,9 +69,11 @@ class MelResNet(nn.Layer):
def forward(self, x):
'''
Args:
x (Tensor):
Input tensor (B, in_dims, T).
Returns:
Tensor:
Output tensor (B, res_out_dims, T).
'''
x = self.conv_in(x)
......@@ -119,10 +121,13 @@ class UpsampleNetwork(nn.Layer):
def forward(self, m):
'''
Args:
m (Tensor):
Input tensor (B, C_aux, T).
Returns:
Tensor:
Output tensor (B, (T - 2 * pad) * prod(upsample_scales), C_aux).
Tensor:
Output tensor (B, (T - 2 * pad) * prod(upsample_scales), res_out_dims).
'''
# aux: [B, C_aux, T]
# -> [B, res_out_dims, T - 2 * aux_context_window]
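The total upsampling factor is the product of the scales; the example values below are an assumption.

import math

upsample_scales = [4, 5, 3, 5]                  # illustrative
samples_per_frame = math.prod(upsample_scales)  # 300 audio samples per input frame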
......@@ -302,7 +307,8 @@ class WaveRNN(nn.Layer):
number of samples for crossfading between batches
mu_law(bool)
Returns:
wav sequence:
Output (T' * prod(upsample_scales), out_channels, C_out).
"""
self.eval()
......@@ -423,7 +429,7 @@ class WaveRNN(nn.Layer):
x(Tensor):
mel, [1, n_frames, 80]
pad(int):
side(str, optional): (Default value = 'both')
Returns:
Tensor
......