Merge branch 'PaddlePaddle:develop' into develop

68bd85a6 · WongLaw · GitHub · fab5b3a3 · 5e714ecb · 68bd85a6
116 changed files
--- a/demos/speech_server/conf/application.yaml
+++ b/demos/speech_server/conf/application.yaml
@@ -61,7 +61,7 @@ tts_python:
    phones_dict: 
    tones_dict: 
    speaker_dict: 
-    spk_id: 0
    # voc (vocoder) choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3',
    #                        'pwgan_vctk', 'mb_melgan_csmsc', 'style_melgan_csmsc',
@@ -87,7 +87,7 @@ tts_inference:
    phones_dict: 
    tones_dict: 
    speaker_dict: 
-    spk_id: 0
    am_predictor_conf:
        device:  # set 'gpu:id' or 'cpu'

--- a/demos/streaming_tts_server/conf/tts_online_application.yaml
+++ b/demos/streaming_tts_server/conf/tts_online_application.yaml
@@ -29,7 +29,7 @@ tts_online:
    phones_dict: 
    tones_dict: 
    speaker_dict: 
-    spk_id: 0
    # voc (vocoder) choices=['mb_melgan_csmsc', 'hifigan_csmsc']
    # Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference
@@ -70,7 +70,6 @@ tts_online-onnx:
    phones_dict: 
    tones_dict: 
    speaker_dict: 
-    spk_id: 0
    am_sample_rate: 24000
    am_sess_conf:
        device: "cpu" # set 'gpu:id' or 'cpu'

--- a/demos/streaming_tts_server/conf/tts_online_ws_application.yaml
+++ b/demos/streaming_tts_server/conf/tts_online_ws_application.yaml
@@ -29,7 +29,7 @@ tts_online:
    phones_dict: 
    tones_dict: 
    speaker_dict: 
-    spk_id: 0
    # voc (vocoder) choices=['mb_melgan_csmsc', 'hifigan_csmsc']
    # Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference
@@ -70,7 +70,6 @@ tts_online-onnx:
    phones_dict: 
    tones_dict: 
    speaker_dict: 
-    spk_id: 0
    am_sample_rate: 24000
    am_sess_conf:
        device: "cpu" # set 'gpu:id' or 'cpu'

--- a/docs/source/api/paddlespeech.audio.rst
+++ b/docs/source/api/paddlespeech.audio.rst
@@ -20,4 +20,7 @@ Subpackages
   paddlespeech.audio.io
   paddlespeech.audio.metric
   paddlespeech.audio.sox_effects
+   paddlespeech.audio.streamdata
+   paddlespeech.audio.text
+   paddlespeech.audio.transform
   paddlespeech.audio.utils
--- a/docs/source/api/paddlespeech.audio.streamdata.autodecode.rst
+++ b/docs/source/api/paddlespeech.audio.streamdata.autodecode.rst
+paddlespeech.audio.streamdata.autodecode module
+===============================================
+.. automodule:: paddlespeech.audio.streamdata.autodecode
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.streamdata.cache.rst
+++ b/docs/source/api/paddlespeech.audio.streamdata.cache.rst
+paddlespeech.audio.streamdata.cache module
+==========================================
+.. automodule:: paddlespeech.audio.streamdata.cache
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.streamdata.compat.rst
+++ b/docs/source/api/paddlespeech.audio.streamdata.compat.rst
+paddlespeech.audio.streamdata.compat module
+===========================================
+.. automodule:: paddlespeech.audio.streamdata.compat
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst
+++ b/docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst
+paddlespeech.audio.streamdata.extradatasets module
+==================================================
+.. automodule:: paddlespeech.audio.streamdata.extradatasets
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.streamdata.filters.rst
+++ b/docs/source/api/paddlespeech.audio.streamdata.filters.rst
+paddlespeech.audio.streamdata.filters module
+============================================
+.. automodule:: paddlespeech.audio.streamdata.filters
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.streamdata.gopen.rst
+++ b/docs/source/api/paddlespeech.audio.streamdata.gopen.rst
+paddlespeech.audio.streamdata.gopen module
+==========================================
+.. automodule:: paddlespeech.audio.streamdata.gopen
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.streamdata.handlers.rst
+++ b/docs/source/api/paddlespeech.audio.streamdata.handlers.rst
+paddlespeech.audio.streamdata.handlers module
+=============================================
+.. automodule:: paddlespeech.audio.streamdata.handlers
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.streamdata.mix.rst
+++ b/docs/source/api/paddlespeech.audio.streamdata.mix.rst
+paddlespeech.audio.streamdata.mix module
+========================================
+.. automodule:: paddlespeech.audio.streamdata.mix
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst
+++ b/docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst
+paddlespeech.audio.streamdata.paddle\_utils module
+==================================================
+.. automodule:: paddlespeech.audio.streamdata.paddle_utils
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.streamdata.pipeline.rst
+++ b/docs/source/api/paddlespeech.audio.streamdata.pipeline.rst
+paddlespeech.audio.streamdata.pipeline module
+=============================================
+.. automodule:: paddlespeech.audio.streamdata.pipeline
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.streamdata.rst
+++ b/docs/source/api/paddlespeech.audio.streamdata.rst
+paddlespeech.audio.streamdata package
+=====================================
+.. automodule:: paddlespeech.audio.streamdata
+   :members:
+   :undoc-members:
+   :show-inheritance:
+Submodules
+----------
+.. toctree::
+   :maxdepth: 4
+   paddlespeech.audio.streamdata.autodecode
+   paddlespeech.audio.streamdata.cache
+   paddlespeech.audio.streamdata.compat
+   paddlespeech.audio.streamdata.extradatasets
+   paddlespeech.audio.streamdata.filters
+   paddlespeech.audio.streamdata.gopen
+   paddlespeech.audio.streamdata.handlers
+   paddlespeech.audio.streamdata.mix
+   paddlespeech.audio.streamdata.paddle_utils
+   paddlespeech.audio.streamdata.pipeline
+   paddlespeech.audio.streamdata.shardlists
+   paddlespeech.audio.streamdata.tariterators
+   paddlespeech.audio.streamdata.utils
+   paddlespeech.audio.streamdata.writer
--- a/docs/source/api/paddlespeech.audio.streamdata.shardlists.rst
+++ b/docs/source/api/paddlespeech.audio.streamdata.shardlists.rst
+paddlespeech.audio.streamdata.shardlists module
+===============================================
+.. automodule:: paddlespeech.audio.streamdata.shardlists
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.streamdata.tariterators.rst
+++ b/docs/source/api/paddlespeech.audio.streamdata.tariterators.rst
+paddlespeech.audio.streamdata.tariterators module
+=================================================
+.. automodule:: paddlespeech.audio.streamdata.tariterators
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.streamdata.utils.rst
+++ b/docs/source/api/paddlespeech.audio.streamdata.utils.rst
+paddlespeech.audio.streamdata.utils module
+==========================================
+.. automodule:: paddlespeech.audio.streamdata.utils
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.streamdata.writer.rst
+++ b/docs/source/api/paddlespeech.audio.streamdata.writer.rst
+paddlespeech.audio.streamdata.writer module
+===========================================
+.. automodule:: paddlespeech.audio.streamdata.writer
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.text.rst
+++ b/docs/source/api/paddlespeech.audio.text.rst
+paddlespeech.audio.text package
+===============================
+.. automodule:: paddlespeech.audio.text
+   :members:
+   :undoc-members:
+   :show-inheritance:
+Submodules
+----------
+.. toctree::
+   :maxdepth: 4
+   paddlespeech.audio.text.text_featurizer
+   paddlespeech.audio.text.utility
--- a/docs/source/api/paddlespeech.audio.text.text_featurizer.rst
+++ b/docs/source/api/paddlespeech.audio.text.text_featurizer.rst
+paddlespeech.audio.text.text\_featurizer module
+===============================================
+.. automodule:: paddlespeech.audio.text.text_featurizer
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.text.utility.rst
+++ b/docs/source/api/paddlespeech.audio.text.utility.rst
+paddlespeech.audio.text.utility module
+======================================
+.. automodule:: paddlespeech.audio.text.utility
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.transform.add_deltas.rst
+++ b/docs/source/api/paddlespeech.audio.transform.add_deltas.rst
+paddlespeech.audio.transform.add\_deltas module
+===============================================
+.. automodule:: paddlespeech.audio.transform.add_deltas
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.transform.channel_selector.rst
+++ b/docs/source/api/paddlespeech.audio.transform.channel_selector.rst
+paddlespeech.audio.transform.channel\_selector module
+=====================================================
+.. automodule:: paddlespeech.audio.transform.channel_selector
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.transform.cmvn.rst
+++ b/docs/source/api/paddlespeech.audio.transform.cmvn.rst
+paddlespeech.audio.transform.cmvn module
+========================================
+.. automodule:: paddlespeech.audio.transform.cmvn
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.transform.functional.rst
+++ b/docs/source/api/paddlespeech.audio.transform.functional.rst
+paddlespeech.audio.transform.functional module
+==============================================
+.. automodule:: paddlespeech.audio.transform.functional
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.transform.perturb.rst
+++ b/docs/source/api/paddlespeech.audio.transform.perturb.rst
+paddlespeech.audio.transform.perturb module
+===========================================
+.. automodule:: paddlespeech.audio.transform.perturb
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.transform.rst
+++ b/docs/source/api/paddlespeech.audio.transform.rst
+paddlespeech.audio.transform package
+====================================
+.. automodule:: paddlespeech.audio.transform
+   :members:
+   :undoc-members:
+   :show-inheritance:
+Submodules
+----------
+.. toctree::
+   :maxdepth: 4
+   paddlespeech.audio.transform.add_deltas
+   paddlespeech.audio.transform.channel_selector
+   paddlespeech.audio.transform.cmvn
+   paddlespeech.audio.transform.functional
+   paddlespeech.audio.transform.perturb
+   paddlespeech.audio.transform.spec_augment
+   paddlespeech.audio.transform.spectrogram
+   paddlespeech.audio.transform.transform_interface
+   paddlespeech.audio.transform.transformation
+   paddlespeech.audio.transform.wpe
--- a/docs/source/api/paddlespeech.audio.transform.spec_augment.rst
+++ b/docs/source/api/paddlespeech.audio.transform.spec_augment.rst
+paddlespeech.audio.transform.spec\_augment module
+=================================================
+.. automodule:: paddlespeech.audio.transform.spec_augment
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.transform.spectrogram.rst
+++ b/docs/source/api/paddlespeech.audio.transform.spectrogram.rst
+paddlespeech.audio.transform.spectrogram module
+===============================================
+.. automodule:: paddlespeech.audio.transform.spectrogram
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.transform.transform_interface.rst
+++ b/docs/source/api/paddlespeech.audio.transform.transform_interface.rst
+paddlespeech.audio.transform.transform\_interface module
+========================================================
+.. automodule:: paddlespeech.audio.transform.transform_interface
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.transform.transformation.rst
+++ b/docs/source/api/paddlespeech.audio.transform.transformation.rst
+paddlespeech.audio.transform.transformation module
+==================================================
+.. automodule:: paddlespeech.audio.transform.transformation
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.transform.wpe.rst
+++ b/docs/source/api/paddlespeech.audio.transform.wpe.rst
+paddlespeech.audio.transform.wpe module
+=======================================
+.. automodule:: paddlespeech.audio.transform.wpe
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.utils.check_kwargs.rst
+++ b/docs/source/api/paddlespeech.audio.utils.check_kwargs.rst
+paddlespeech.audio.utils.check\_kwargs module
+=============================================
+.. automodule:: paddlespeech.audio.utils.check_kwargs
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.utils.dynamic_import.rst
+++ b/docs/source/api/paddlespeech.audio.utils.dynamic_import.rst
+paddlespeech.audio.utils.dynamic\_import module
+===============================================
+.. automodule:: paddlespeech.audio.utils.dynamic_import
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.audio.utils.rst
+++ b/docs/source/api/paddlespeech.audio.utils.rst
@@ -12,8 +12,11 @@ Submodules
 .. toctree::
   :maxdepth: 4
+   paddlespeech.audio.utils.check_kwargs
   paddlespeech.audio.utils.download
+   paddlespeech.audio.utils.dynamic_import
   paddlespeech.audio.utils.error
   paddlespeech.audio.utils.log
   paddlespeech.audio.utils.numeric
+   paddlespeech.audio.utils.tensor_utils
   paddlespeech.audio.utils.time
--- a/docs/source/api/paddlespeech.audio.utils.tensor_utils.rst
+++ b/docs/source/api/paddlespeech.audio.utils.tensor_utils.rst
+paddlespeech.audio.utils.tensor\_utils module
+=============================================
+.. automodule:: paddlespeech.audio.utils.tensor_utils
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst
+++ b/docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst
+paddlespeech.kws.exps.mdtc.collate module
+=========================================
+.. automodule:: paddlespeech.kws.exps.mdtc.collate
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst
+++ b/docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst
+paddlespeech.kws.exps.mdtc.compute\_det module
+==============================================
+.. automodule:: paddlespeech.kws.exps.mdtc.compute_det
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst
+++ b/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst
+paddlespeech.kws.exps.mdtc.plot\_det\_curve module
+==================================================
+.. automodule:: paddlespeech.kws.exps.mdtc.plot_det_curve
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.kws.exps.mdtc.rst
+++ b/docs/source/api/paddlespeech.kws.exps.mdtc.rst
+paddlespeech.kws.exps.mdtc package
+==================================
+.. automodule:: paddlespeech.kws.exps.mdtc
+   :members:
+   :undoc-members:
+   :show-inheritance:
+Submodules
+----------
+.. toctree::
+   :maxdepth: 4
+   paddlespeech.kws.exps.mdtc.collate
+   paddlespeech.kws.exps.mdtc.compute_det
+   paddlespeech.kws.exps.mdtc.plot_det_curve
+   paddlespeech.kws.exps.mdtc.score
+   paddlespeech.kws.exps.mdtc.train
--- a/docs/source/api/paddlespeech.kws.exps.mdtc.score.rst
+++ b/docs/source/api/paddlespeech.kws.exps.mdtc.score.rst
+paddlespeech.kws.exps.mdtc.score module
+=======================================
+.. automodule:: paddlespeech.kws.exps.mdtc.score
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.kws.exps.mdtc.train.rst
+++ b/docs/source/api/paddlespeech.kws.exps.mdtc.train.rst
+paddlespeech.kws.exps.mdtc.train module
+=======================================
+.. automodule:: paddlespeech.kws.exps.mdtc.train
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.kws.exps.rst
+++ b/docs/source/api/paddlespeech.kws.exps.rst
+paddlespeech.kws.exps package
+=============================
+.. automodule:: paddlespeech.kws.exps
+   :members:
+   :undoc-members:
+   :show-inheritance:
+Subpackages
+-----------
+.. toctree::
+   :maxdepth: 4
+   paddlespeech.kws.exps.mdtc
--- a/docs/source/api/paddlespeech.kws.rst
+++ b/docs/source/api/paddlespeech.kws.rst
@@ -12,4 +12,5 @@ Subpackages
 .. toctree::
   :maxdepth: 4
+   paddlespeech.kws.exps
   paddlespeech.kws.models
--- a/docs/source/api/paddlespeech.resource.model_alias.rst
+++ b/docs/source/api/paddlespeech.resource.model_alias.rst
+paddlespeech.resource.model\_alias module
+=========================================
+.. automodule:: paddlespeech.resource.model_alias
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.resource.pretrained_models.rst
+++ b/docs/source/api/paddlespeech.resource.pretrained_models.rst
+paddlespeech.resource.pretrained\_models module
+===============================================
+.. automodule:: paddlespeech.resource.pretrained_models
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.resource.resource.rst
+++ b/docs/source/api/paddlespeech.resource.resource.rst
+paddlespeech.resource.resource module
+=====================================
+.. automodule:: paddlespeech.resource.resource
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.resource.rst
+++ b/docs/source/api/paddlespeech.resource.rst
+paddlespeech.resource package
+=============================
+.. automodule:: paddlespeech.resource
+   :members:
+   :undoc-members:
+   :show-inheritance:
+Submodules
+----------
+.. toctree::
+   :maxdepth: 4
+   paddlespeech.resource.model_alias
+   paddlespeech.resource.pretrained_models
+   paddlespeech.resource.resource
--- a/docs/source/api/paddlespeech.rst
+++ b/docs/source/api/paddlespeech.rst
@@ -16,8 +16,10 @@ Subpackages
   paddlespeech.cli
   paddlespeech.cls
   paddlespeech.kws
+   paddlespeech.resource
   paddlespeech.s2t
   paddlespeech.server
   paddlespeech.t2s
   paddlespeech.text
+   paddlespeech.utils
   paddlespeech.vector
--- a/docs/source/api/paddlespeech.s2t.rst
+++ b/docs/source/api/paddlespeech.s2t.rst
@@ -19,5 +19,4 @@ Subpackages
   paddlespeech.s2t.models
   paddlespeech.s2t.modules
   paddlespeech.s2t.training
-   paddlespeech.s2t.transform
   paddlespeech.s2t.utils
--- a/docs/source/api/paddlespeech.server.utils.rst
+++ b/docs/source/api/paddlespeech.server.utils.rst
@@ -18,7 +18,6 @@ Submodules
   paddlespeech.server.utils.config
   paddlespeech.server.utils.errors
   paddlespeech.server.utils.exception
-   paddlespeech.server.utils.log
   paddlespeech.server.utils.onnx_infer
   paddlespeech.server.utils.paddle_predictor
   paddlespeech.server.utils.util

--- a/docs/source/api/paddlespeech.t2s.datasets.rst
+++ b/docs/source/api/paddlespeech.t2s.datasets.rst
@@ -19,4 +19,5 @@ Submodules
   paddlespeech.t2s.datasets.get_feats
   paddlespeech.t2s.datasets.ljspeech
   paddlespeech.t2s.datasets.preprocess_utils
+   paddlespeech.t2s.datasets.sampler
   paddlespeech.t2s.datasets.vocoder_batch_fn
--- a/docs/source/api/paddlespeech.t2s.datasets.sampler.rst
+++ b/docs/source/api/paddlespeech.t2s.datasets.sampler.rst
+paddlespeech.t2s.datasets.sampler module
+========================================
+.. automodule:: paddlespeech.t2s.datasets.sampler
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst
+++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst
+paddlespeech.t2s.exps.ernie\_sat.align module
+=============================================
+.. automodule:: paddlespeech.t2s.exps.ernie_sat.align
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst
+++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst
+paddlespeech.t2s.exps.ernie\_sat.normalize module
+=================================================
+.. automodule:: paddlespeech.t2s.exps.ernie_sat.normalize
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst
+++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst
+paddlespeech.t2s.exps.ernie\_sat.preprocess module
+==================================================
+.. automodule:: paddlespeech.t2s.exps.ernie_sat.preprocess
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst
+++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst
+paddlespeech.t2s.exps.ernie\_sat package
+========================================
+.. automodule:: paddlespeech.t2s.exps.ernie_sat
+   :members:
+   :undoc-members:
+   :show-inheritance:
+Submodules
+----------
+.. toctree::
+   :maxdepth: 4
+   paddlespeech.t2s.exps.ernie_sat.align
+   paddlespeech.t2s.exps.ernie_sat.normalize
+   paddlespeech.t2s.exps.ernie_sat.preprocess
+   paddlespeech.t2s.exps.ernie_sat.synthesize
+   paddlespeech.t2s.exps.ernie_sat.synthesize_e2e
+   paddlespeech.t2s.exps.ernie_sat.train
+   paddlespeech.t2s.exps.ernie_sat.utils
--- a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst
+++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst
+paddlespeech.t2s.exps.ernie\_sat.synthesize module
+==================================================
+.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst
+++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst
+paddlespeech.t2s.exps.ernie\_sat.synthesize\_e2e module
+=======================================================
+.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize_e2e
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst
+++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst
+paddlespeech.t2s.exps.ernie\_sat.train module
+=============================================
+.. automodule:: paddlespeech.t2s.exps.ernie_sat.train
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst
+++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst
+paddlespeech.t2s.exps.ernie\_sat.utils module
+=============================================
+.. automodule:: paddlespeech.t2s.exps.ernie_sat.utils
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst
+++ b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst
@@ -16,3 +16,4 @@ Submodules
   paddlespeech.t2s.exps.fastspeech2.normalize
   paddlespeech.t2s.exps.fastspeech2.preprocess
   paddlespeech.t2s.exps.fastspeech2.train
+   paddlespeech.t2s.exps.fastspeech2.vc2_infer
--- a/docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst
+++ b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst
+paddlespeech.t2s.exps.fastspeech2.vc2\_infer module
+===================================================
+.. automodule:: paddlespeech.t2s.exps.fastspeech2.vc2_infer
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.t2s.exps.rst
+++ b/docs/source/api/paddlespeech.t2s.exps.rst
@@ -12,11 +12,13 @@ Subpackages
 .. toctree::
   :maxdepth: 4
+   paddlespeech.t2s.exps.ernie_sat
   paddlespeech.t2s.exps.fastspeech2
   paddlespeech.t2s.exps.gan_vocoder
   paddlespeech.t2s.exps.speedyspeech
   paddlespeech.t2s.exps.tacotron2
   paddlespeech.t2s.exps.transformer_tts
+   paddlespeech.t2s.exps.vits
   paddlespeech.t2s.exps.waveflow
   paddlespeech.t2s.exps.wavernn
@@ -31,6 +33,7 @@ Submodules
   paddlespeech.t2s.exps.ort_predict
   paddlespeech.t2s.exps.ort_predict_e2e
   paddlespeech.t2s.exps.ort_predict_streaming
+   paddlespeech.t2s.exps.stream_play_tts
   paddlespeech.t2s.exps.syn_utils
   paddlespeech.t2s.exps.synthesize
   paddlespeech.t2s.exps.synthesize_e2e

--- a/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst
+++ b/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst
+paddlespeech.t2s.exps.stream\_play\_tts module
+==============================================
+.. automodule:: paddlespeech.t2s.exps.stream_play_tts
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst
+++ b/docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst
+paddlespeech.t2s.exps.vits.normalize module
+===========================================
+.. automodule:: paddlespeech.t2s.exps.vits.normalize
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst
+++ b/docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst
+paddlespeech.t2s.exps.vits.preprocess module
+============================================
+.. automodule:: paddlespeech.t2s.exps.vits.preprocess
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.t2s.exps.vits.rst
+++ b/docs/source/api/paddlespeech.t2s.exps.vits.rst
+paddlespeech.t2s.exps.vits package
+==================================
+.. automodule:: paddlespeech.t2s.exps.vits
+   :members:
+   :undoc-members:
+   :show-inheritance:
+Submodules
+----------
+.. toctree::
+   :maxdepth: 4
+   paddlespeech.t2s.exps.vits.normalize
+   paddlespeech.t2s.exps.vits.preprocess
+   paddlespeech.t2s.exps.vits.synthesize
+   paddlespeech.t2s.exps.vits.synthesize_e2e
+   paddlespeech.t2s.exps.vits.train
+   paddlespeech.t2s.exps.vits.voice_cloning
--- a/docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst
+++ b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst
+paddlespeech.t2s.exps.vits.synthesize module
+============================================
+.. automodule:: paddlespeech.t2s.exps.vits.synthesize
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst
+++ b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst
+paddlespeech.t2s.exps.vits.synthesize\_e2e module
+=================================================
+.. automodule:: paddlespeech.t2s.exps.vits.synthesize_e2e
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.t2s.exps.vits.train.rst
+++ b/docs/source/api/paddlespeech.t2s.exps.vits.train.rst
+paddlespeech.t2s.exps.vits.train module
+=======================================
+.. automodule:: paddlespeech.t2s.exps.vits.train
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst
+++ b/docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst
+paddlespeech.t2s.exps.vits.voice\_cloning module
+================================================
+.. automodule:: paddlespeech.t2s.exps.vits.voice_cloning
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst
+++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst
+paddlespeech.t2s.frontend.g2pw.dataset module
+=============================================
+.. automodule:: paddlespeech.t2s.frontend.g2pw.dataset
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst
+++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst
+paddlespeech.t2s.frontend.g2pw.onnx\_api module
+===============================================
+.. automodule:: paddlespeech.t2s.frontend.g2pw.onnx_api
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.t2s.frontend.g2pw.rst
+++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.rst
+paddlespeech.t2s.frontend.g2pw package
+======================================
+.. automodule:: paddlespeech.t2s.frontend.g2pw
+   :members:
+   :undoc-members:
+   :show-inheritance:
+Submodules
+----------
+.. toctree::
+   :maxdepth: 4
+   paddlespeech.t2s.frontend.g2pw.dataset
+   paddlespeech.t2s.frontend.g2pw.onnx_api
+   paddlespeech.t2s.frontend.g2pw.utils
--- a/docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst
+++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst
+paddlespeech.t2s.frontend.g2pw.utils module
+===========================================
+.. automodule:: paddlespeech.t2s.frontend.g2pw.utils
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst
+++ b/docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst
+paddlespeech.t2s.frontend.mix\_frontend module
+==============================================
+.. automodule:: paddlespeech.t2s.frontend.mix_frontend
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.t2s.frontend.rst
+++ b/docs/source/api/paddlespeech.t2s.frontend.rst
@@ -12,6 +12,7 @@ Subpackages
 .. toctree::
   :maxdepth: 4
+   paddlespeech.t2s.frontend.g2pw
   paddlespeech.t2s.frontend.normalizer
   paddlespeech.t2s.frontend.zh_normalization
@@ -23,6 +24,7 @@ Submodules
   paddlespeech.t2s.frontend.arpabet
   paddlespeech.t2s.frontend.generate_lexicon
+   paddlespeech.t2s.frontend.mix_frontend
   paddlespeech.t2s.frontend.phonectic
   paddlespeech.t2s.frontend.punctuation
   paddlespeech.t2s.frontend.tone_sandhi

--- a/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst
+++ b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst
+paddlespeech.t2s.models.ernie\_sat.ernie\_sat module
+====================================================
+.. automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst
+++ b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst
+paddlespeech.t2s.models.ernie\_sat.ernie\_sat\_updater module
+=============================================================
+.. automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat_updater
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst
+++ b/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst
@@ -12,4 +12,5 @@ Submodules
 .. toctree::
   :maxdepth: 4
-   paddlespeech.t2s.models.ernie_sat.mlm
+   paddlespeech.t2s.models.ernie_sat.ernie_sat
+   paddlespeech.t2s.models.ernie_sat.ernie_sat_updater
--- a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst
+++ b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst
+paddlespeech.t2s.models.vits.monotonic\_align.core module
+=========================================================
+.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.core
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst
+++ b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst
+paddlespeech.t2s.models.vits.monotonic\_align package
+=====================================================
+.. automodule:: paddlespeech.t2s.models.vits.monotonic_align
+   :members:
+   :undoc-members:
+   :show-inheritance:
+Submodules
+----------
+.. toctree::
+   :maxdepth: 4
+   paddlespeech.t2s.models.vits.monotonic_align.core
+   paddlespeech.t2s.models.vits.monotonic_align.setup
--- a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst
+++ b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst
+paddlespeech.t2s.models.vits.monotonic\_align.setup module
+==========================================================
+.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.setup
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.utils.dynamic_import.rst
+++ b/docs/source/api/paddlespeech.utils.dynamic_import.rst
+paddlespeech.utils.dynamic\_import module
+=========================================
+.. automodule:: paddlespeech.utils.dynamic_import
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.utils.env.rst
+++ b/docs/source/api/paddlespeech.utils.env.rst
+paddlespeech.utils.env module
+=============================
+.. automodule:: paddlespeech.utils.env
+   :members:
+   :undoc-members:
+   :show-inheritance:
--- a/docs/source/api/paddlespeech.utils.rst
+++ b/docs/source/api/paddlespeech.utils.rst
+paddlespeech.utils package
+==========================
+.. automodule:: paddlespeech.utils
+   :members:
+   :undoc-members:
+   :show-inheritance:
+Submodules
+----------
+.. toctree::
+   :maxdepth: 4
+   paddlespeech.utils.dynamic_import
+   paddlespeech.utils.env
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -74,8 +74,10 @@ Contents
   paddlespeech.cli <api/paddlespeech.cli>
   paddlespeech.cls <api/paddlespeech.cls>
   paddlespeech.kws <api/paddlespeech.kws>
+   paddlespeech.resource <api/paddlespeech.resource>
   paddlespeech.s2t <api/paddlespeech.s2t>
   paddlespeech.server <api/paddlespeech.server>
   paddlespeech.t2s <api/paddlespeech.t2s>
   paddlespeech.text <api/paddlespeech.text>
+   paddlespeech.utils <api/paddlespeech.utils>
   paddlespeech.vector <api/paddlespeech.vector>
--- a/docs/source/tts/tts_papers.md
+++ b/docs/source/tts/tts_papers.md
@@ -5,6 +5,7 @@
 - [Disambiguation of Chinese Polyphones in an End-to-End Framework with Semantic Features Extracted by Pre-trained BERT](https://www1.se.cuhk.edu.hk/~hccl/publications/pub/201909_INTERSPEECH_DongyangDAI.pdf)
 - [Polyphone Disambiguation in Mandarin Chinese with Semi-Supervised Learning](https://www.isca-speech.org/archive/pdfs/interspeech_2021/shi21d_interspeech.pdf)
  * github: https://github.com/PaperMechanica/SemiPPL
+- [WikipediaHomographData](https://github.com/google-research-datasets/WikipediaHomographData)
 ### Text Normalization
 #### English
 - [applenob/text_normalization](https://github.com/applenob/text_normalization)

--- a/examples/aishell3/ernie_sat/README.md
+++ b/examples/aishell3/ernie_sat/README.md
-# ERNIE-SAT with AISHELL3 dataset
+# ERNIE-SAT with AISHELL3 dataset
+ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as Speech Editing, personalized Speech Synthesis, and Voice Cloning.
-ERNIE-SAT 是可以同时处理中英文的跨语言的语音-语言跨模态大模型，其在语音编辑、个性化语音合成以及跨语言的语音合成等多个任务取得了领先效果。可以应用于语音编辑、个性化合成、语音克隆、同传翻译等一系列场景，该项目供研究使用。
+## Model Framework
+In ERNIE-SAT, we propose two innovations:
-## 模型框架
+- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping
-ERNIE-SAT 中我们提出了两项创新：
+- The joint mask learning of speech and text is used to realize the alignment of speech and text
- 在预训练过程中将中英双语对应的音素作为输入，实现了跨语言、个性化的软音素映射
- 采用语言和语音的联合掩码学习实现了语言和语音的对齐
 <p align="center">
    <img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" />

--- a/examples/aishell3_vctk/ernie_sat/README.md
+++ b/examples/aishell3_vctk/ernie_sat/README.md
-# ERNIE-SAT with AISHELL3 and VCTK dataset
+# ERNIE-SAT with AISHELL3 and VCTK dataset
+ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as Speech Editing, personalized Speech Synthesis, and Voice Cloning.
-ERNIE-SAT 是可以同时处理中英文的跨语言的语音-语言跨模态大模型，其在语音编辑、个性化语音合成以及跨语言的语音合成等多个任务取得了领先效果。可以应用于语音编辑、个性化合成、语音克隆、同传翻译等一系列场景，该项目供研究使用。
+## Model Framework
+In ERNIE-SAT, we propose two innovations:
-## 模型框架
+- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping
-ERNIE-SAT 中我们提出了两项创新：
+- The joint mask learning of speech and text is used to realize the alignment of speech and text
- 在预训练过程中将中英双语对应的音素作为输入，实现了跨语言、个性化的软音素映射
- 采用语言和语音的联合掩码学习实现了语言和语音的对齐
 <p align="center">
    <img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" />

--- a/examples/other/tts_finetune/tts3/README.md
+++ b/examples/other/tts_finetune/tts3/README.md
@@ -75,6 +75,15 @@ When "Prepare" done. The structure of the current directory is listed below.
 ```
+### Set finetune.yaml
+`finetune.yaml` contains some configurations for fine-tuning. You can try various options to find better results.
+Arguments:
+  - `batch_size`: batch size for fine-tuning. Default: -1, which means 64, the same as the pretrained model
+  - `learning_rate`: learning rate. Default: 0.0001
+  - `num_snapshots`: number of saved model snapshots. Default: -1, which means 5, the same as the pretrained model
+  - `frozen_layers`: layers to freeze. Must be a list. If you don't want to freeze any layer, set it to `[]`.
 ## Get Started
 Run the command below to

--- a/examples/other/tts_finetune/tts3/finetune.py
+++ b/examples/other/tts_finetune/tts3/finetune.py
@@ -14,6 +14,7 @@
 import argparse
 import os
 from pathlib import Path
+from typing import List
 from typing import Union
 import yaml
@@ -21,10 +22,10 @@ from local.check_oov import get_check_result
 from local.extract import extract_feature
 from local.label_process import get_single_label
 from local.prepare_env import generate_finetune_env
+from local.train import train_sp
 from paddle import distributed as dist
 from yacs.config import CfgNode
-from paddlespeech.t2s.exps.fastspeech2.train import train_sp
 from utils.gen_duration_from_textgrid import gen_duration_from_textgrid
 DICT_EN = 'tools/aligner/cmudict-0.7b'
@@ -38,15 +39,24 @@ os.environ['PATH'] = MFA_PATH + '/:' + os.environ['PATH']
 class TrainArgs():
-    def __init__(self, ngpu, config_file, dump_dir: Path, output_dir: Path):
+    def __init__(self,
+                 ngpu,
+                 config_file,
+                 dump_dir: Path,
+                 output_dir: Path,
+                 frozen_layers: List[str]):
+        # config: fastspeech2 config file.
        self.config = str(config_file)
        self.train_metadata = str(dump_dir / "train/norm/metadata.jsonl")
        self.dev_metadata = str(dump_dir / "dev/norm/metadata.jsonl")
+        # model output dir.
        self.output_dir = str(output_dir)
        self.ngpu = ngpu
        self.phones_dict = str(dump_dir / "phone_id_map.txt")
        self.speaker_dict = str(dump_dir / "speaker_id_map.txt")
        self.voice_cloning = False
+        # frozen layers
+        self.frozen_layers = frozen_layers
 def get_mfa_result(
@@ -122,12 +132,11 @@ if __name__ == '__main__':
        "--ngpu", type=int, default=2, help="if ngpu=0, use cpu.")
    parser.add_argument("--epoch", type=int, default=100, help="finetune epoch")
    parser.add_argument(
-        "--batch_size",
+        "--finetune_config",
-        type=int,
+        type=str,
-        default=-1,
+        default="./finetune.yaml",
-        help="batch size, default -1 means same as pretrained model")
+        help="Path to finetune config file")
    args = parser.parse_args()
@@ -147,8 +156,14 @@ if __name__ == '__main__':
    with open(config_file) as f:
        config = CfgNode(yaml.safe_load(f))
    config.max_epoch = config.max_epoch + args.epoch
-    if args.batch_size > 0:
-        config.batch_size = args.batch_size
+    with open(args.finetune_config) as f2:
+        finetune_config = CfgNode(yaml.safe_load(f2))
+    config.batch_size = finetune_config.batch_size if finetune_config.batch_size > 0 else config.batch_size
+    config.optimizer.learning_rate = finetune_config.learning_rate if finetune_config.learning_rate > 0 else config.optimizer.learning_rate
+    config.num_snapshots = finetune_config.num_snapshots if finetune_config.num_snapshots > 0 else config.num_snapshots
+    frozen_layers = finetune_config.frozen_layers
+    assert type(frozen_layers) == list, "frozen_layers should be set a list."
    if args.lang == 'en':
        lexicon_file = DICT_EN
@@ -158,6 +173,13 @@ if __name__ == '__main__':
        mfa_phone_file = MFA_PHONE_ZH
    else:
        print('please input right lang!!')
+    print(f"finetune max_epoch: {config.max_epoch}")
+    print(f"finetune batch_size: {config.batch_size}")
+    print(f"finetune learning_rate: {config.optimizer.learning_rate}")
+    print(f"finetune num_snapshots: {config.num_snapshots}")
+    print(f"finetune frozen_layers: {frozen_layers}")
    am_phone_file = pretrained_model_dir / "phone_id_map.txt"
    label_file = input_dir / "labels.txt"
@@ -181,7 +203,8 @@ if __name__ == '__main__':
    generate_finetune_env(output_dir, pretrained_model_dir)
    # create a new args for training
-    train_args = TrainArgs(args.ngpu, config_file, dump_dir, output_dir)
+    train_args = TrainArgs(args.ngpu, config_file, dump_dir, output_dir,
+                           frozen_layers)
    # finetune models
    # dispatch

--- a/examples/other/tts_finetune/tts3/finetune.yaml
+++ b/examples/other/tts_finetune/tts3/finetune.yaml
+###########################################################
+#                 PARAS SETTING               #
+###########################################################
+# Set to -1 to indicate that the parameter is the same as the pretrained model configuration
+batch_size: -1
+learning_rate: 0.0001     # learning rate
+num_snapshots: -1
+# frozen_layers should be a list
+# if you don't need to freeze, set frozen_layers to []
+frozen_layers: ["encoder", "duration_predictor"]
--- a/examples/other/tts_finetune/tts3/local/extract.py
+++ b/examples/other/tts_finetune/tts3/local/extract.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
-import math
 import os
 from operator import itemgetter
 from pathlib import Path
@@ -211,9 +210,9 @@ def extract_feature(duration_file: str,
    mel_extractor, pitch_extractor, energy_extractor = get_extractor(config)
    wav_files = sorted(list((input_dir).rglob("*.wav")))
-    # split data into 3 sections, train: 80%, dev: 10%, test: 10%
+    # split data into 3 sections, train: len(wav_files) - 2, dev: 1, test: 1
-    num_train = math.ceil(len(wav_files) * 0.8)
+    num_train = len(wav_files) - 2
-    num_dev = math.ceil(len(wav_files) * 0.1)
+    num_dev = 1
    print(num_train, num_dev)
    train_wav_files = wav_files[:num_train]

--- a/examples/other/tts_finetune/tts3/local/train.py
+++ b/examples/other/tts_finetune/tts3/local/train.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import os
+import shutil
+from pathlib import Path
+from typing import List
+import jsonlines
+import numpy as np
+import paddle
+from paddle import DataParallel
+from paddle import distributed as dist
+from paddle.io import DataLoader
+from paddle.io import DistributedBatchSampler
+from paddlespeech.t2s.datasets.am_batch_fn import fastspeech2_multi_spk_batch_fn
+from paddlespeech.t2s.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn
+from paddlespeech.t2s.datasets.data_table import DataTable
+from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
+from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Evaluator
+from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Updater
+from paddlespeech.t2s.training.extensions.snapshot import Snapshot
+from paddlespeech.t2s.training.extensions.visualizer import VisualDL
+from paddlespeech.t2s.training.optimizer import build_optimizers
+from paddlespeech.t2s.training.seeding import seed_everything
+from paddlespeech.t2s.training.trainer import Trainer
+def freeze_layer(model, layers: List[str]):
+    """freeze layers
+    Args:
+        layers (List[str]): frozen layers
+    """
+    for layer in layers:
+        for param in eval("model." + layer + ".parameters()"):
+            param.trainable = False
+def train_sp(args, config):
+    # decides device type and whether to run in parallel
+    # setup running environment correctly
+    if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
+        paddle.set_device("cpu")
+    else:
+        paddle.set_device("gpu")
+    world_size = paddle.distributed.get_world_size()
+    if world_size > 1:
+        paddle.distributed.init_parallel_env()
+    # set the random seed, it is a must for multiprocess training
+    seed_everything(config.seed)
+    print(
+        f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
+    )
+    fields = [
+        "text", "text_lengths", "speech", "speech_lengths", "durations",
+        "pitch", "energy"
+    ]
+    converters = {"speech": np.load, "pitch": np.load, "energy": np.load}
+    spk_num = None
+    if args.speaker_dict is not None:
+        print("multiple speaker fastspeech2!")
+        collate_fn = fastspeech2_multi_spk_batch_fn
+        with open(args.speaker_dict, 'rt') as f:
+            spk_id = [line.strip().split() for line in f.readlines()]
+        spk_num = len(spk_id)
+        fields += ["spk_id"]
+    elif args.voice_cloning:
+        print("Training voice cloning!")
+        collate_fn = fastspeech2_multi_spk_batch_fn
+        fields += ["spk_emb"]
+        converters["spk_emb"] = np.load
+    else:
+        print("single speaker fastspeech2!")
+        collate_fn = fastspeech2_single_spk_batch_fn
+    print("spk_num:", spk_num)
+    # dataloader has been too verbose
+    logging.getLogger("DataLoader").disabled = True
+    # construct dataset for training and validation
+    with jsonlines.open(args.train_metadata, 'r') as reader:
+        train_metadata = list(reader)
+    train_dataset = DataTable(
+        data=train_metadata,
+        fields=fields,
+        converters=converters, )
+    with jsonlines.open(args.dev_metadata, 'r') as reader:
+        dev_metadata = list(reader)
+    dev_dataset = DataTable(
+        data=dev_metadata,
+        fields=fields,
+        converters=converters, )
+    # collate function and dataloader
+    train_sampler = DistributedBatchSampler(
+        train_dataset,
+        batch_size=config.batch_size,
+        shuffle=True,
+        drop_last=True)
+    print("samplers done!")
+    train_dataloader = DataLoader(
+        train_dataset,
+        batch_sampler=train_sampler,
+        collate_fn=collate_fn,
+        num_workers=config.num_workers)
+    dev_dataloader = DataLoader(
+        dev_dataset,
+        shuffle=False,
+        drop_last=False,
+        batch_size=config.batch_size,
+        collate_fn=collate_fn,
+        num_workers=config.num_workers)
+    print("dataloaders done!")
+    with open(args.phones_dict, "r") as f:
+        phn_id = [line.strip().split() for line in f.readlines()]
+    vocab_size = len(phn_id)
+    print("vocab_size:", vocab_size)
+    odim = config.n_mels
+    model = FastSpeech2(
+        idim=vocab_size, odim=odim, spk_num=spk_num, **config["model"])
+    # freeze layer
+    if args.frozen_layers != []:
+        freeze_layer(model, args.frozen_layers)
+    if world_size > 1:
+        model = DataParallel(model)
+    print("model done!")
+    optimizer = build_optimizers(model, **config["optimizer"])
+    print("optimizer done!")
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    if dist.get_rank() == 0:
+        config_name = args.config.split("/")[-1]
+        # copy conf to output_dir
+        shutil.copyfile(args.config, output_dir / config_name)
+    updater = FastSpeech2Updater(
+        model=model,
+        optimizer=optimizer,
+        dataloader=train_dataloader,
+        output_dir=output_dir,
+        **config["updater"])
+    trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)
+    evaluator = FastSpeech2Evaluator(
+        model, dev_dataloader, output_dir=output_dir, **config["updater"])
+    if dist.get_rank() == 0:
+        trainer.extend(evaluator, trigger=(1, "epoch"))
+        trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
+    trainer.extend(
+        Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
+    trainer.run()
--- a/examples/other/tts_finetune/tts3/run.sh
+++ b/examples/other/tts_finetune/tts3/run.sh
@@ -10,11 +10,12 @@ mfa_dir=./mfa_result
 dump_dir=./dump
 output_dir=./exp/default
 lang=zh
-ngpu=2
+ngpu=1
+finetune_config=./finetune.yaml
-ckpt=snapshot_iter_96600
+ckpt=snapshot_iter_96699
-gpus=0,1
+gpus=1
 CUDA_VISIBLE_DEVICES=${gpus}
 stage=0
 stop_stage=100
@@ -35,7 +36,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --output_dir=${output_dir} \
        --lang=${lang} \
        --ngpu=${ngpu} \
-        --epoch=100
+        --epoch=100 \
+        --finetune_config=${finetune_config}
 fi
@@ -54,7 +56,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --voc_stat=pretrained_models/hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../sentences.txt \
-        --output_dir=./test_e2e \
+        --output_dir=./test_e2e/ \
        --phones_dict=${dump_dir}/phone_id_map.txt \
        --speaker_dict=${dump_dir}/speaker_id_map.txt \
        --spk_id=0 

--- a/examples/vctk/ernie_sat/README.md
+++ b/examples/vctk/ernie_sat/README.md
 # ERNIE-SAT with VCTK dataset
+ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as Speech Editing, personalized Speech Synthesis, and Voice Cloning.
-ERNIE-SAT 是可以同时处理中英文的跨语言的语音-语言跨模态大模型，其在语音编辑、个性化语音合成以及跨语言的语音合成等多个任务取得了领先效果。可以应用于语音编辑、个性化合成、语音克隆、同传翻译等一系列场景，该项目供研究使用。
+## Model Framework
+In ERNIE-SAT, we propose two innovations:
-## 模型框架
+- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping
-ERNIE-SAT 中我们提出了两项创新：
+- The joint mask learning of speech and text is used to realize the alignment of speech and text
- 在预训练过程中将中英双语对应的音素作为输入，实现了跨语言、个性化的软音素映射
- 采用语言和语音的联合掩码学习实现了语言和语音的对齐
 <p align="center">
    <img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" />

--- a/examples/wenetspeech/asr1/RESULTS.md
+++ b/examples/wenetspeech/asr1/RESULTS.md
@@ -46,3 +46,10 @@ Pretrain model from https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1
 | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug  | aishell1 | ctc_greedy_search | 16 | 0.078918 |  
 | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug  | aishell1 | ctc_prefix_beam_search | 16 | 0.079080 |  
 | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug  | aishell1 | attention_rescoring | 16 | 0.054401 |
+| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size | CER |  
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug  | aishell1 | attention | -1 | 0.050767 |  
+| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug  | aishell1 | ctc_greedy_search | -1 | 0.061884 |  
+| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug  | aishell1 | ctc_prefix_beam_search | -1 | 0.062056 |  
+| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug  | aishell1 | attention_rescoring | -1 |  0.052110 |
--- a/paddlespeech/server/engine/engine_warmup.py
+++ b/paddlespeech/server/engine/engine_warmup.py
--- a/paddlespeech/t2s/frontend/g2pw/__init__.py
+++ b/paddlespeech/t2s/frontend/g2pw/__init__.py
--- a/paddlespeech/t2s/frontend/g2pw/dataset.py
+++ b/paddlespeech/t2s/frontend/g2pw/dataset.py
--- a/paddlespeech/t2s/frontend/g2pw/onnx_api.py
+++ b/paddlespeech/t2s/frontend/g2pw/onnx_api.py
--- a/paddlespeech/t2s/frontend/g2pw/utils.py
+++ b/paddlespeech/t2s/frontend/g2pw/utils.py
--- a/paddlespeech/t2s/models/ernie_sat/ernie_sat.py
+++ b/paddlespeech/t2s/models/ernie_sat/ernie_sat.py
--- a/paddlespeech/t2s/models/vits/duration_predictor.py
+++ b/paddlespeech/t2s/models/vits/duration_predictor.py
--- a/paddlespeech/t2s/models/vits/flow.py
+++ b/paddlespeech/t2s/models/vits/flow.py
--- a/paddlespeech/t2s/models/vits/generator.py
+++ b/paddlespeech/t2s/models/vits/generator.py
--- a/paddlespeech/t2s/models/vits/posterior_encoder.py
+++ b/paddlespeech/t2s/models/vits/posterior_encoder.py
--- a/paddlespeech/t2s/models/vits/residual_coupling.py
+++ b/paddlespeech/t2s/models/vits/residual_coupling.py
--- a/paddlespeech/t2s/models/vits/text_encoder.py
+++ b/paddlespeech/t2s/models/vits/text_encoder.py
--- a/paddlespeech/t2s/models/vits/vits.py
+++ b/paddlespeech/t2s/models/vits/vits.py
--- a/paddlespeech/t2s/models/vits/wavenet/residual_block.py
+++ b/paddlespeech/t2s/models/vits/wavenet/residual_block.py
--- a/paddlespeech/t2s/models/vits/wavenet/wavenet.py
+++ b/paddlespeech/t2s/models/vits/wavenet/wavenet.py
--- a/paddlespeech/t2s/models/wavernn/wavernn.py
+++ b/paddlespeech/t2s/models/wavernn/wavernn.py