Merge pull request #1446 from lym0302/tts-server3

[server] add params type

Merge pull request #1446 from lym0302/tts-server3
[server] add params type
79c064fe · Hui Zhang · GitHub · 12195378 · 37d9dc5a · 79c064fe
6 changed files
--- a/speechserving/speechserving/conf/tts/tts.yaml
+++ b/speechserving/speechserving/conf/tts/tts.yaml
 # This is the parameter configuration file for TTS server.
-##################################################################
-#                     TTS SERVER SETTING                         #
-##################################################################
-host: '0.0.0.0'
-port: 8692
 ##################################################################
 #                  ACOUSTIC MODEL SETTING                        #
 # am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc',

--- a/speechserving/speechserving/conf/tts/tts_pd.yaml
+++ b/speechserving/speechserving/conf/tts/tts_pd.yaml
 # This is the parameter configuration file for TTS server.
 # These are the static models that support paddle inference.
-##################################################################
-#                     TTS SERVER SETTING                         #
-##################################################################
-host: '0.0.0.0'
-port: 8692
 ##################################################################
 #                  ACOUSTIC MODEL SETTING                        #
 # am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc']

--- a/speechserving/speechserving/engine/tts/paddleinference/tts_engine.py
+++ b/speechserving/speechserving/engine/tts/paddleinference/tts_engine.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import argparse
 import base64
 import io
 import os
@@ -21,7 +20,6 @@ import librosa
 import numpy as np
 import paddle
 import soundfile as sf
-import yaml
 from engine.base_engine import BaseEngine
 from scipy.io import wavfile
@@ -32,6 +30,7 @@ from paddlespeech.cli.utils import MODEL_HOME
 from paddlespeech.t2s.frontend import English
 from paddlespeech.t2s.frontend.zh_frontend import Frontend
 from utils.audio_process import change_speed
+from utils.config import get_config
 from utils.errors import ErrorCode
 from utils.exception import ServerBaseException
 from utils.paddle_predictor import init_predictor
@@ -118,14 +117,7 @@ pretrained_models = {
 class TTSServerExecutor(TTSExecutor):
    def __init__(self):
        super().__init__()
+        pass
-        self.parser = argparse.ArgumentParser(
-            prog='paddlespeech.tts', add_help=True)
-        self.parser.add_argument(
-            '--conf',
-            type=str,
-            default='./conf/tts/tts_pd.yaml',
-            help='Configuration parameters.')
    def _get_pretrained_path(self, tag: str) -> os.PathLike:
        """
@@ -224,7 +216,10 @@ class TTSServerExecutor(TTSExecutor):
            self.voc_sample_rate = voc_sample_rate
            self.voc_res_path = os.path.dirname(os.path.abspath(self.voc_model))
-        assert (self.voc_sample_rate == self.am_sample_rate)
+        assert (
+            self.voc_sample_rate == self.am_sample_rate
+        ), "The sample rate of AM and Vocoder model are different, please check model."
        # Init body.
        with open(self.phones_dict, "r") as f:
            phn_id = [line.strip().split() for line in f.readlines()]
@@ -339,31 +334,31 @@ class TTSEngine(BaseEngine):
        metaclass: Defaults to Singleton.
    """
-    def __init__(self, name=None):
+    def __init__(self):
        """Initialize TTS server engine
        """
        super(TTSEngine, self).__init__()
-        self.executor = TTSServerExecutor()
-        config_path = self.executor.parser.parse_args().conf
+    def init(self, config_file: str):
-        with open(config_path, 'rt') as f:
+        self.executor = TTSServerExecutor()
-            self.conf_dict = yaml.safe_load(f)
+        self.config_file = config_file
+        self.config = get_config(config_file)
        self.executor._init_from_path(
-            am=self.conf_dict["am"],
+            am=self.config.am,
-            am_model=self.conf_dict["am_model"],
+            am_model=self.config.am_model,
-            am_params=self.conf_dict["am_params"],
+            am_params=self.config.am_params,
-            am_sample_rate=self.conf_dict["am_sample_rate"],
+            am_sample_rate=self.config.am_sample_rate,
-            phones_dict=self.conf_dict["phones_dict"],
+            phones_dict=self.config.phones_dict,
-            tones_dict=self.conf_dict["tones_dict"],
+            tones_dict=self.config.tones_dict,
-            speaker_dict=self.conf_dict["speaker_dict"],
+            speaker_dict=self.config.speaker_dict,
-            voc=self.conf_dict["voc"],
+            voc=self.config.voc,
-            voc_model=self.conf_dict["voc_model"],
+            voc_model=self.config.voc_model,
-            voc_params=self.conf_dict["voc_params"],
+            voc_params=self.config.voc_params,
-            voc_sample_rate=self.conf_dict["voc_sample_rate"],
+            voc_sample_rate=self.config.voc_sample_rate,
-            lang=self.conf_dict["lang"],
+            lang=self.config.lang,
-            am_predictor_conf=self.conf_dict["am_predictor_conf"],
+            am_predictor_conf=self.config.am_predictor_conf,
-            voc_predictor_conf=self.conf_dict["voc_predictor_conf"], )
+            voc_predictor_conf=self.config.voc_predictor_conf, )
        logger.info("Initialize TTS server engine successfully.")
@@ -382,6 +377,13 @@ class TTSEngine(BaseEngine):
            target_fs (int): target audio sample rate
            volume (float): target volume
            speed (float): target speed
+        Raises:
+            ServerBaseException: Raised if changing the speed fails.
+        Returns:
+            target_fs: target sample rate for synthesized audio.
+            wav_base64: The base64 format of the synthesized audio.
        """
        # transform sample_rate
@@ -440,21 +442,20 @@ class TTSEngine(BaseEngine):
            save_path (str, optional): The save path of the synthesized audio. Defaults to None.
        Raises:
-            ServerBaseException: Exception
+            ServerBaseException: Raised if tts inference fails.
-            ServerBaseException: Exception
+            ServerBaseException: Raised if postprocessing fails.
        Returns:
-            lang, target_sample_rate, wav_base64
+            lang: model language 
+            target_sample_rate: target sample rate for synthesized audio.
+            wav_base64: The base64 format of the synthesized audio.
        """
-        lang = self.conf_dict["lang"]
+        lang = self.config.lang
        try:
            self.executor.infer(
-                text=sentence,
+                text=sentence, lang=lang, am=self.config.am, spk_id=spk_id)
-                lang=lang,
-                am=self.conf_dict["am"],
-                spk_id=spk_id)
        except:
            raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR,
                                      "tts infer failed.")

--- a/speechserving/speechserving/engine/tts/python/tts_engine.py
+++ b/speechserving/speechserving/engine/tts/python/tts_engine.py
@@ -11,20 +11,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import argparse
 import base64
 import io
 import librosa
 import numpy as np
 import soundfile as sf
-import yaml
 from engine.base_engine import BaseEngine
 from scipy.io import wavfile
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.tts.infer import TTSExecutor
 from utils.audio_process import change_speed
+from utils.config import get_config
 from utils.errors import ErrorCode
 from utils.exception import ServerBaseException
@@ -34,14 +33,7 @@ __all__ = ['TTSEngine']
 class TTSServerExecutor(TTSExecutor):
    def __init__(self):
        super().__init__()
+        pass
-        self.parser = argparse.ArgumentParser(
-            prog='paddlespeech.tts', add_help=True)
-        self.parser.add_argument(
-            '--conf',
-            type=str,
-            default='./conf/tts/tts.yaml',
-            help='Configuration parameters.')
 class TTSEngine(BaseEngine):
@@ -55,25 +47,25 @@ class TTSEngine(BaseEngine):
        """Initialize TTS server engine
        """
        super(TTSEngine, self).__init__()
-        self.executor = TTSServerExecutor()
-        config_path = self.executor.parser.parse_args().conf
+    def init(self, config_file: str):
-        with open(config_path, 'rt') as f:
+        self.executor = TTSServerExecutor()
-            self.conf_dict = yaml.safe_load(f)
+        self.config_file = config_file
+        self.config = get_config(config_file)
        self.executor._init_from_path(
-            am=self.conf_dict["am"],
+            am=self.config.am,
-            am_config=self.conf_dict["am_config"],
+            am_config=self.config.am_config,
-            am_ckpt=self.conf_dict["am_ckpt"],
+            am_ckpt=self.config.am_ckpt,
-            am_stat=self.conf_dict["am_stat"],
+            am_stat=self.config.am_stat,
-            phones_dict=self.conf_dict["phones_dict"],
+            phones_dict=self.config.phones_dict,
-            tones_dict=self.conf_dict["tones_dict"],
+            tones_dict=self.config.tones_dict,
-            speaker_dict=self.conf_dict["speaker_dict"],
+            speaker_dict=self.config.speaker_dict,
-            voc=self.conf_dict["voc"],
+            voc=self.config.voc,
-            voc_config=self.conf_dict["voc_config"],
+            voc_config=self.config.voc_config,
-            voc_ckpt=self.conf_dict["voc_ckpt"],
+            voc_ckpt=self.config.voc_ckpt,
-            voc_stat=self.conf_dict["voc_stat"],
+            voc_stat=self.config.voc_stat,
-            lang=self.conf_dict["lang"])
+            lang=self.config.lang)
        logger.info("Initialize TTS server engine successfully.")
@@ -92,6 +84,13 @@ class TTSEngine(BaseEngine):
            target_fs (int): target audio sample rate
            volume (float): target volume
            speed (float): target speed
+        Raises:
+            ServerBaseException: Raised if changing the speed fails.
+        Returns:
+            target_fs: target sample rate for synthesized audio.
+            wav_base64: The base64 format of the synthesized audio.
        """
        # transform sample_rate
@@ -137,15 +136,33 @@ class TTSEngine(BaseEngine):
            volume: float=1.0,
            sample_rate: int=0,
            save_path: str=None):
+        """ run include inference and postprocess.
+        Args:
+            sentence (str): text to be synthesized
+            spk_id (int, optional): speaker id for multi-speaker speech synthesis. Defaults to 0.
+            speed (float, optional): speed. Defaults to 1.0.
+            volume (float, optional): volume. Defaults to 1.0.
+            sample_rate (int, optional): target sample rate for synthesized audio, 
+            0 means the same as the model sampling rate. Defaults to 0.
+            save_path (str, optional): The save path of the synthesized audio. 
+            None means do not save audio. Defaults to None.
+        Raises:
+            ServerBaseException: Raised if tts inference fails.
+            ServerBaseException: Raised if postprocessing fails.
+        Returns:
+            lang: model language 
+            target_sample_rate: target sample rate for synthesized audio.
+            wav_base64: The base64 format of the synthesized audio.
+        """
-        lang = self.conf_dict["lang"]
+        lang = self.config.lang
        try:
            self.executor.infer(
-                text=sentence,
+                text=sentence, lang=lang, am=self.config.am, spk_id=spk_id)
-                lang=lang,
-                am=self.conf_dict["am"],
-                spk_id=spk_id)
        except:
            raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR,
                                      "tts infer failed.")

--- a/speechserving/speechserving/utils/audio_process.py
+++ b/speechserving/speechserving/utils/audio_process.py
@@ -15,8 +15,17 @@ import wave
 import numpy as np
+from paddlespeech.cli.log import logger
 def wav2pcm(wavfile, pcmfile, data_type=np.int16):
+    """ Save the wav file as a pcm file
+    Args:
+        wavfile (str): wav file path
+        pcmfile (str): pcm file save path
+        data_type (type, optional): pcm sample type. Defaults to np.int16.
+    """
    with open(wavfile, "rb") as f:
        f.seek(0)
        f.read(44)
@@ -25,12 +34,21 @@ def wav2pcm(wavfile, pcmfile, data_type=np.int16):
 def pcm2wav(pcm_file, wav_file, channels=1, bits=16, sample_rate=16000):
+    """Save the pcm file as a wav file
+    Args:
+        pcm_file (str): pcm file path
+        wav_file (str): wav file save path
+        channels (int, optional): audio channel. Defaults to 1.
+        bits (int, optional): Bit depth. Defaults to 16.
+        sample_rate (int, optional): sample rate. Defaults to 16000.
+    """
    pcmf = open(pcm_file, 'rb')
    pcmdata = pcmf.read()
    pcmf.close()
    if bits % 8 != 0:
-        raise ValueError("bits % 8 must == 0. now bits:" + str(bits))
+        logger.error("bits % 8 must == 0. now bits:" + str(bits))
    wavfile = wave.open(wav_file, 'wb')
    wavfile.setnchannels(channels)

--- a/speechserving/tests/tts/test_client.py
+++ b/speechserving/tests/tts/test_client.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import base64
+import io
+import json
+import os
+import random
+import time
+import numpy as np
+import requests
+import soundfile
+def wav2pcm(wavfile: str, pcmfile: str, data_type=np.int16):
+    """Save the samples of a wav file as a headerless pcm file.
+    Args:
+        wavfile (str): path of the wav file to read.
+        pcmfile (str): path the pcm data is written to.
+        data_type (type, optional): dtype used to read the samples. Defaults to np.int16.
+    """
+    # Number of leading bytes skipped before the samples are read.
+    header_size = 44
+    with open(wavfile, "rb") as wav_in:
+        wav_in.seek(0)
+        wav_in.read(header_size)
+        samples = np.fromfile(wav_in, dtype=data_type)
+    samples.tofile(pcmfile)
+# Request and response
+def tts_client(args):
+    """Send one synthesis request to the TTS server and save the returned audio.
+    Args:
+        args: parsed command-line arguments providing ``text``, ``spk_id``,
+            ``speed``, ``volume``, ``sample_rate`` and ``output``.
+    Returns:
+        tuple: number of samples in the decoded audio and its sample rate.
+    """
+    url = 'http://127.0.0.1:8090/paddlespeech/tts'
+    request = {
+        "text": args.text,
+        "spk_id": args.spk_id,
+        "speed": args.speed,
+        "volume": args.volume,
+        "sample_rate": args.sample_rate,
+        "save_path": args.output
+    }
+    response = requests.post(url, json.dumps(request))
+    response_dict = response.json()
+    wav_base64 = response_dict["result"]["audio"]
+    audio_data_byte = base64.b64decode(wav_base64)
+    # Decode the returned wav bytes in memory.
+    samples, sample_rate = soundfile.read(
+        io.BytesIO(audio_data_byte), dtype='float32')
+    # Write the audio in the format implied by the output file extension.
+    outfile = args.output
+    if outfile.endswith(".wav"):
+        soundfile.write(outfile, samples, sample_rate)
+    elif outfile.endswith(".pcm"):
+        # A temporary wav is written first, then converted by wav2pcm.
+        temp_wav = str(random.getrandbits(128)) + ".wav"
+        try:
+            soundfile.write(temp_wav, samples, sample_rate)
+            wav2pcm(temp_wav, outfile, data_type=np.int16)
+        finally:
+            # Clean up without spawning a shell, and also when the
+            # conversion above raises.
+            if os.path.exists(temp_wav):
+                os.remove(temp_wav)
+    else:
+        print("The format for saving audio only supports wav or pcm")
+    return len(samples), sample_rate
+# Command-line entry point: parse the request options, call the server once
+# and report timing statistics for the synthesized audio.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--text',
+        type=str,
+        default="你好，欢迎使用语音合成服务",
+        help='A sentence to be synthesized')
+    parser.add_argument('--spk_id', type=int, default=0, help='Speaker id')
+    parser.add_argument('--speed', type=float, default=1.0, help='Audio speed')
+    parser.add_argument(
+        '--volume', type=float, default=1.0, help='Audio volume')
+    parser.add_argument(
+        '--sample_rate',
+        type=int,
+        default=0,
+        help='Sampling rate, the default is the same as the model')
+    parser.add_argument(
+        '--output',
+        type=str,
+        default="./out.wav",
+        help='Synthesized audio file')
+    args = parser.parse_args()
+    st = time.time()
+    try:
+        samples_length, sample_rate = tts_client(args)
+        time_consume = time.time() - st
+        duration = samples_length / sample_rate
+        # Real-time factor: processing time divided by audio duration.
+        rtf = time_consume / duration
+        print("Synthesized audio successfully.")
+        print("Inference time: %f" % (time_consume))
+        print("The duration of synthesized audio: %f" % (duration))
+        print("The RTF is: %f" % (rtf))
+    except Exception as e:
+        # Catch Exception rather than everything so Ctrl-C and SystemExit
+        # still propagate, and show the cause instead of hiding it.
+        print("Failed to synthesized audio.")
+        print("Reason: %r" % (e, ))