Merge branch 'tts-server3' of https://github.com/lym0302/PaddleSpeech into tts-server3

e354848c · lym0302 · 830e91ca · 20149e89 · e354848c · e354848c
2 changed files
--- a/speechserving/speechserving/conf/tts/tts_pd.yaml
+++ b/speechserving/speechserving/conf/tts/tts_pd.yaml
@@ -14,7 +14,7 @@ port: 8692
 am: 'fastspeech2_csmsc'   
 am_model: # the pdmodel file of am static model
 am_params: # the pdiparams file of am static model
-sample_rate: 24000
+am_sample_rate: 24000
 phones_dict: 
 tones_dict: 
 speaker_dict: 
@@ -33,6 +33,7 @@ am_predictor_conf:
 voc: 'pwgan_csmsc'
 voc_model: # the pdmodel file of vocoder static model
 voc_params: # the pdiparams file of vocoder static model 
+voc_sample_rate: 24000
 voc_predictor_conf:
  use_gpu: True

--- a/speechserving/speechserving/engine/tts/paddleinference/tts_engine.py
+++ b/speechserving/speechserving/engine/tts/paddleinference/tts_engine.py
@@ -83,6 +83,8 @@ pretrained_models = {
        'pwgan_csmsc.pdmodel',
        'params':
        'pwgan_csmsc.pdiparams',
+        'sample_rate':
+        24000,
    },
    # mb_melgan
    "mb_melgan_csmsc-zh": {
@@ -94,6 +96,8 @@ pretrained_models = {
        'mb_melgan_csmsc.pdmodel',
        'params':
        'mb_melgan_csmsc.pdiparams',
+        'sample_rate':
+        24000,
    },
    # hifigan
    "hifigan_csmsc-zh": {
@@ -105,6 +109,8 @@ pretrained_models = {
        'hifigan_csmsc.pdmodel',
        'params':
        'hifigan_csmsc.pdiparams',
+        'sample_rate':
+        24000,
    },
 }
@@ -141,13 +147,14 @@ class TTSServerExecutor(TTSExecutor):
            am: str='fastspeech2_csmsc',
            am_model: Optional[os.PathLike]=None,
            am_params: Optional[os.PathLike]=None,
-            sample_rate: int=24000,
+            am_sample_rate: int=24000,
            phones_dict: Optional[os.PathLike]=None,
            tones_dict: Optional[os.PathLike]=None,
            speaker_dict: Optional[os.PathLike]=None,
            voc: str='pwgan_csmsc',
            voc_model: Optional[os.PathLike]=None,
            voc_params: Optional[os.PathLike]=None,
+            voc_sample_rate: int=24000,
            lang: str='zh',
            am_predictor_conf: dict=None,
            voc_predictor_conf: dict=None, ):
@@ -169,7 +176,7 @@ class TTSServerExecutor(TTSExecutor):
            # must have phones_dict in acoustic
            self.phones_dict = os.path.join(
                am_res_path, pretrained_models[am_tag]['phones_dict'])
-            self.sample_rate = pretrained_models[am_tag]['sample_rate']
+            self.am_sample_rate = pretrained_models[am_tag]['sample_rate']
            logger.info(am_res_path)
            logger.info(self.am_model)
@@ -178,7 +185,7 @@ class TTSServerExecutor(TTSExecutor):
            self.am_model = os.path.abspath(am_model)
            self.am_params = os.path.abspath(am_params)
            self.phones_dict = os.path.abspath(phones_dict)
-            self.sample_rate = sample_rate
+            self.am_sample_rate = am_sample_rate
            self.am_res_path = os.path.dirname(os.path.abspath(self.am_model))
        print("self.phones_dict:", self.phones_dict)
@@ -207,14 +214,17 @@ class TTSServerExecutor(TTSExecutor):
                                          pretrained_models[voc_tag]['model'])
            self.voc_params = os.path.join(voc_res_path,
                                           pretrained_models[voc_tag]['params'])
+            self.voc_sample_rate = pretrained_models[voc_tag]['sample_rate']
            logger.info(voc_res_path)
            logger.info(self.voc_model)
            logger.info(self.voc_params)
        else:
            self.voc_model = os.path.abspath(voc_model)
            self.voc_params = os.path.abspath(voc_params)
+            self.voc_sample_rate = voc_sample_rate
            self.voc_res_path = os.path.dirname(os.path.abspath(self.voc_model))
+        assert (self.voc_sample_rate == self.am_sample_rate)
        # Init body.
        with open(self.phones_dict, "r") as f:
            phn_id = [line.strip().split() for line in f.readlines()]
@@ -343,13 +353,14 @@ class TTSEngine(BaseEngine):
            am=self.conf_dict["am"],
            am_model=self.conf_dict["am_model"],
            am_params=self.conf_dict["am_params"],
-            sample_rate=self.conf_dict["sample_rate"],
+            am_sample_rate=self.conf_dict["am_sample_rate"],
            phones_dict=self.conf_dict["phones_dict"],
            tones_dict=self.conf_dict["tones_dict"],
            speaker_dict=self.conf_dict["speaker_dict"],
            voc=self.conf_dict["voc"],
            voc_model=self.conf_dict["voc_model"],
            voc_params=self.conf_dict["voc_params"],
+            voc_sample_rate=self.conf_dict["voc_sample_rate"],
            lang=self.conf_dict["lang"],
            am_predictor_conf=self.conf_dict["am_predictor_conf"],
            voc_predictor_conf=self.conf_dict["voc_predictor_conf"], )
@@ -451,7 +462,7 @@ class TTSEngine(BaseEngine):
        try:
            target_sample_rate, wav_base64 = self.postprocess(
                wav=self.executor._outputs['wav'].numpy(),
-                original_fs=self.executor.sample_rate,
+                original_fs=self.executor.am_sample_rate,
                target_fs=sample_rate,
                volume=volume,
                speed=speed,