diff --git a/speechserving/speechserving/conf/tts/tts.yaml b/speechserving/speechserving/conf/tts/tts.yaml
index 8e08d51c128d3b21c5c21f651831007905ff9218..d0e128eaee0c14783d23867563ee0275fbceef1b 100644
--- a/speechserving/speechserving/conf/tts/tts.yaml
+++ b/speechserving/speechserving/conf/tts/tts.yaml
@@ -1,11 +1,5 @@
 # This is the parameter configuration file for TTS server.
 
-##################################################################
-#                       TTS SERVER SETTING                       #
-##################################################################
-host: '0.0.0.0'
-port: 8692
-
 ##################################################################
 #                     ACOUSTIC MODEL SETTING                     #
 # am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc',
diff --git a/speechserving/speechserving/conf/tts/tts_pd.yaml b/speechserving/speechserving/conf/tts/tts_pd.yaml
index eebd1f124003e488483c155a33d3b3cf70992965..c268c6a336bb21be7879980cb3cb3c59611d64cd 100644
--- a/speechserving/speechserving/conf/tts/tts_pd.yaml
+++ b/speechserving/speechserving/conf/tts/tts_pd.yaml
@@ -1,12 +1,6 @@
 # This is the parameter configuration file for TTS server.
 # These are the static models that support paddle inference.
 
-##################################################################
-#                       TTS SERVER SETTING                       #
-##################################################################
-host: '0.0.0.0'
-port: 8692
-
 ##################################################################
 #                     ACOUSTIC MODEL SETTING                     #
 # am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc']
diff --git a/speechserving/speechserving/engine/tts/paddleinference/tts_engine.py b/speechserving/speechserving/engine/tts/paddleinference/tts_engine.py
index 5a447c08ba5329dae88166ebbe8e1c437b71e2ad..2ab8bcb45bba124302e3f31d58235525a1bb98f7 100644
--- a/speechserving/speechserving/engine/tts/paddleinference/tts_engine.py
+++ b/speechserving/speechserving/engine/tts/paddleinference/tts_engine.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import argparse
 import base64
 import io
 import os
@@ -21,7 +20,6 @@ import librosa
 import numpy as np
 import paddle
 import soundfile as sf
-import yaml
 from engine.base_engine import BaseEngine
 from scipy.io import wavfile
 
@@ -32,6 +30,7 @@ from paddlespeech.cli.utils import MODEL_HOME
 from paddlespeech.t2s.frontend import English
 from paddlespeech.t2s.frontend.zh_frontend import Frontend
 from utils.audio_process import change_speed
+from utils.config import get_config
 from utils.errors import ErrorCode
 from utils.exception import ServerBaseException
 from utils.paddle_predictor import init_predictor
@@ -118,14 +117,7 @@ pretrained_models = {
 class TTSServerExecutor(TTSExecutor):
     def __init__(self):
         super().__init__()
-
-        self.parser = argparse.ArgumentParser(
-            prog='paddlespeech.tts', add_help=True)
-        self.parser.add_argument(
-            '--conf',
-            type=str,
-            default='./conf/tts/tts_pd.yaml',
-            help='Configuration parameters.')
+        pass
 
     def _get_pretrained_path(self, tag: str) -> os.PathLike:
         """
@@ -224,7 +216,10 @@ class TTSServerExecutor(TTSExecutor):
 
         self.voc_sample_rate = voc_sample_rate
         self.voc_res_path = os.path.dirname(os.path.abspath(self.voc_model))
-        assert (self.voc_sample_rate == self.am_sample_rate)
+        assert (
+            self.voc_sample_rate == self.am_sample_rate
+        ), "The sample rates of the AM and the vocoder are different, please check the models."
+
         # Init body.
         with open(self.phones_dict, "r") as f:
             phn_id = [line.strip().split() for line in f.readlines()]
@@ -339,31 +334,31 @@ class TTSEngine(BaseEngine):
         metaclass: Defaults to Singleton.
     """
 
-    def __init__(self, name=None):
+    def __init__(self):
         """Initialize TTS server engine
         """
         super(TTSEngine, self).__init__()
-        self.executor = TTSServerExecutor()
 
-        config_path = self.executor.parser.parse_args().conf
-        with open(config_path, 'rt') as f:
-            self.conf_dict = yaml.safe_load(f)
+    def init(self, config_file: str):
+        self.executor = TTSServerExecutor()
+        self.config_file = config_file
+        self.config = get_config(config_file)
 
         self.executor._init_from_path(
-            am=self.conf_dict["am"],
-            am_model=self.conf_dict["am_model"],
-            am_params=self.conf_dict["am_params"],
-            am_sample_rate=self.conf_dict["am_sample_rate"],
-            phones_dict=self.conf_dict["phones_dict"],
-            tones_dict=self.conf_dict["tones_dict"],
-            speaker_dict=self.conf_dict["speaker_dict"],
-            voc=self.conf_dict["voc"],
-            voc_model=self.conf_dict["voc_model"],
-            voc_params=self.conf_dict["voc_params"],
-            voc_sample_rate=self.conf_dict["voc_sample_rate"],
-            lang=self.conf_dict["lang"],
-            am_predictor_conf=self.conf_dict["am_predictor_conf"],
-            voc_predictor_conf=self.conf_dict["voc_predictor_conf"], )
+            am=self.config.am,
+            am_model=self.config.am_model,
+            am_params=self.config.am_params,
+            am_sample_rate=self.config.am_sample_rate,
+            phones_dict=self.config.phones_dict,
+            tones_dict=self.config.tones_dict,
+            speaker_dict=self.config.speaker_dict,
+            voc=self.config.voc,
+            voc_model=self.config.voc_model,
+            voc_params=self.config.voc_params,
+            voc_sample_rate=self.config.voc_sample_rate,
+            lang=self.config.lang,
+            am_predictor_conf=self.config.am_predictor_conf,
+            voc_predictor_conf=self.config.voc_predictor_conf, )
 
         logger.info("Initialize TTS server engine successfully.")
 
@@ -382,6 +377,13 @@ class TTSEngine(BaseEngine):
             target_fs (int): target audio sample rate
             volume (float): target volume
             speed (float): target speed
+
+        Raises:
+            ServerBaseException: Raised if changing the audio speed fails.
+
+        Returns:
+            target_fs: target sample rate for synthesized audio.
+            wav_base64: The base64 format of the synthesized audio.
         """
 
         # transform sample_rate
@@ -440,21 +442,20 @@ class TTSEngine(BaseEngine):
             save_path (str, optional): The save path of the synthesized audio.
                 Defaults to None.
 
        Raises:
-            ServerBaseException: Exception
-            ServerBaseException: Exception
+            ServerBaseException: Raised if TTS inference fails.
+            ServerBaseException: Raised if audio postprocessing fails.
 
        Returns:
-            lang, target_sample_rate, wav_base64
+            lang: model language
+            target_sample_rate: target sample rate for synthesized audio.
+            wav_base64: The base64 format of the synthesized audio.
""" - lang = self.conf_dict["lang"] + lang = self.config.lang try: self.executor.infer( - text=sentence, - lang=lang, - am=self.conf_dict["am"], - spk_id=spk_id) + text=sentence, lang=lang, am=self.config.am, spk_id=spk_id) except: raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR, "tts infer failed.") diff --git a/speechserving/speechserving/engine/tts/python/tts_engine.py b/speechserving/speechserving/engine/tts/python/tts_engine.py index e8d42619bdddcbedf67831d47f9831680dbfe3f2..32a0dca32d156e85601142a0b0e1e137f6779b16 100644 --- a/speechserving/speechserving/engine/tts/python/tts_engine.py +++ b/speechserving/speechserving/engine/tts/python/tts_engine.py @@ -11,20 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import argparse import base64 import io import librosa import numpy as np import soundfile as sf -import yaml from engine.base_engine import BaseEngine from scipy.io import wavfile from paddlespeech.cli.log import logger from paddlespeech.cli.tts.infer import TTSExecutor from utils.audio_process import change_speed +from utils.config import get_config from utils.errors import ErrorCode from utils.exception import ServerBaseException @@ -34,14 +33,7 @@ __all__ = ['TTSEngine'] class TTSServerExecutor(TTSExecutor): def __init__(self): super().__init__() - - self.parser = argparse.ArgumentParser( - prog='paddlespeech.tts', add_help=True) - self.parser.add_argument( - '--conf', - type=str, - default='./conf/tts/tts.yaml', - help='Configuration parameters.') + pass class TTSEngine(BaseEngine): @@ -55,25 +47,25 @@ class TTSEngine(BaseEngine): """Initialize TTS server engine """ super(TTSEngine, self).__init__() - self.executor = TTSServerExecutor() - config_path = self.executor.parser.parse_args().conf - with open(config_path, 'rt') as f: - self.conf_dict = yaml.safe_load(f) + def init(self, config_file: str): + self.executor = TTSServerExecutor() + self.config_file = config_file + self.config = get_config(config_file) self.executor._init_from_path( - am=self.conf_dict["am"], - am_config=self.conf_dict["am_config"], - am_ckpt=self.conf_dict["am_ckpt"], - am_stat=self.conf_dict["am_stat"], - phones_dict=self.conf_dict["phones_dict"], - tones_dict=self.conf_dict["tones_dict"], - speaker_dict=self.conf_dict["speaker_dict"], - voc=self.conf_dict["voc"], - voc_config=self.conf_dict["voc_config"], - voc_ckpt=self.conf_dict["voc_ckpt"], - voc_stat=self.conf_dict["voc_stat"], - lang=self.conf_dict["lang"]) + am=self.config.am, + am_config=self.config.am_config, + am_ckpt=self.config.am_ckpt, + am_stat=self.config.am_stat, + phones_dict=self.config.phones_dict, + tones_dict=self.config.tones_dict, + speaker_dict=self.config.speaker_dict, + voc=self.config.voc, + voc_config=self.config.voc_config, + voc_ckpt=self.config.voc_ckpt, + voc_stat=self.config.voc_stat, + lang=self.config.lang) logger.info("Initialize TTS server engine successfully.") @@ -92,6 +84,13 @@ class TTSEngine(BaseEngine): target_fs (int): target audio sample rate volume (float): target volume speed (float): target speed + + Raises: + ServerBaseException: Throws an exception if the change speed unsuccessfully. + + Returns: + target_fs: target sample rate for synthesized audio. + wav_base64: The base64 format of the synthesized audio. 
""" # transform sample_rate @@ -137,15 +136,33 @@ class TTSEngine(BaseEngine): volume: float=1.0, sample_rate: int=0, save_path: str=None): + """ run include inference and postprocess. + + Args: + sentence (str): text to be synthesized + spk_id (int, optional): speaker id for multi-speaker speech synthesis. Defaults to 0. + speed (float, optional): speed. Defaults to 1.0. + volume (float, optional): volume. Defaults to 1.0. + sample_rate (int, optional): target sample rate for synthesized audio, + 0 means the same as the model sampling rate. Defaults to 0. + save_path (str, optional): The save path of the synthesized audio. + None means do not save audio. Defaults to None. + + Raises: + ServerBaseException: Throws an exception if tts inference unsuccessfully. + ServerBaseException: Throws an exception if postprocess unsuccessfully. + + Returns: + lang: model language + target_sample_rate: target sample rate for synthesized audio. + wav_base64: The base64 format of the synthesized audio. + """ - lang = self.conf_dict["lang"] + lang = self.config.lang try: self.executor.infer( - text=sentence, - lang=lang, - am=self.conf_dict["am"], - spk_id=spk_id) + text=sentence, lang=lang, am=self.config.am, spk_id=spk_id) except: raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR, "tts infer failed.") diff --git a/speechserving/speechserving/utils/audio_process.py b/speechserving/speechserving/utils/audio_process.py index 78f120a60324573fb6bc196919dedb06b90720a5..3cbb495a67ffcb54444fd44173571eccb02addef 100644 --- a/speechserving/speechserving/utils/audio_process.py +++ b/speechserving/speechserving/utils/audio_process.py @@ -15,8 +15,17 @@ import wave import numpy as np +from paddlespeech.cli.log import logger + def wav2pcm(wavfile, pcmfile, data_type=np.int16): + """ Save the wav file as a pcm file + + Args: + wavfile (str): wav file path + pcmfile (str): pcm file save path + data_type (type, optional): pcm sample type. Defaults to np.int16. + """ with open(wavfile, "rb") as f: f.seek(0) f.read(44) @@ -25,12 +34,21 @@ def wav2pcm(wavfile, pcmfile, data_type=np.int16): def pcm2wav(pcm_file, wav_file, channels=1, bits=16, sample_rate=16000): + """Save the pcm file as a wav file + + Args: + pcm_file (str): pcm file path + wav_file (str): wav file save path + channels (int, optional): audio channel. Defaults to 1. + bits (int, optional): Bit depth. Defaults to 16. + sample_rate (int, optional): sample rate. Defaults to 16000. + """ pcmf = open(pcm_file, 'rb') pcmdata = pcmf.read() pcmf.close() if bits % 8 != 0: - raise ValueError("bits % 8 must == 0. now bits:" + str(bits)) + logger.error("bits % 8 must == 0. now bits:" + str(bits)) wavfile = wave.open(wav_file, 'wb') wavfile.setnchannels(channels) diff --git a/speechserving/tests/tts/test_client.py b/speechserving/tests/tts/test_client.py new file mode 100644 index 0000000000000000000000000000000000000000..71fb36d977fa08e27dafd53f368ff58ba43cbe3a --- /dev/null +++ b/speechserving/tests/tts/test_client.py @@ -0,0 +1,110 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import base64
+import io
+import json
+import os
+import random
+import time
+
+import numpy as np
+import requests
+import soundfile
+
+
+def wav2pcm(wavfile: str, pcmfile: str, data_type=np.int16):
+    with open(wavfile, "rb") as f:
+        f.seek(0)
+        f.read(44)
+        data = np.fromfile(f, dtype=data_type)
+        data.tofile(pcmfile)
+
+
+# Request and response
+def tts_client(args):
+    """ Request and response
+    Args:
+        text: A sentence to be synthesized
+        outfile: Synthetic audio file
+    """
+    url = 'http://127.0.0.1:8090/paddlespeech/tts'
+    request = {
+        "text": args.text,
+        "spk_id": args.spk_id,
+        "speed": args.speed,
+        "volume": args.volume,
+        "sample_rate": args.sample_rate,
+        "save_path": args.output
+    }
+
+    response = requests.post(url, json.dumps(request))
+    response_dict = response.json()
+    wav_base64 = response_dict["result"]["audio"]
+
+    audio_data_byte = base64.b64decode(wav_base64)
+    # from byte
+    samples, sample_rate = soundfile.read(
+        io.BytesIO(audio_data_byte), dtype='float32')
+
+    # transform audio
+    outfile = args.output
+    if outfile.endswith(".wav"):
+        soundfile.write(outfile, samples, sample_rate)
+    elif outfile.endswith(".pcm"):
+        temp_wav = str(random.getrandbits(128)) + ".wav"
+        soundfile.write(temp_wav, samples, sample_rate)
+        wav2pcm(temp_wav, outfile, data_type=np.int16)
+        os.system("rm %s" % (temp_wav))
+    else:
+        print("The format for saving audio only supports wav or pcm")
+
+    return len(samples), sample_rate
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--text',
+        type=str,
+        default="你好,欢迎使用语音合成服务",
+        help='A sentence to be synthesized')
+    parser.add_argument('--spk_id', type=int, default=0, help='Speaker id')
+    parser.add_argument('--speed', type=float, default=1.0, help='Audio speed')
+    parser.add_argument(
+        '--volume', type=float, default=1.0, help='Audio volume')
+    parser.add_argument(
+        '--sample_rate',
+        type=int,
+        default=0,
+        help='Sampling rate, the default is the same as the model')
+    parser.add_argument(
+        '--output',
+        type=str,
+        default="./out.wav",
+        help='Synthesized audio file')
+    args = parser.parse_args()
+
+    st = time.time()
+    try:
+        samples_length, sample_rate = tts_client(args)
+        time_consume = time.time() - st
+        duration = samples_length / sample_rate
+        rtf = time_consume / duration
+        print("Synthesized audio successfully.")
+        print("Inference time: %f" % (time_consume))
+        print("The duration of synthesized audio: %f" % (duration))
+        print("The RTF is: %f" % (rtf))
+    except:
+        print("Failed to synthesize audio.")
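Note on the resulting API: this patch removes the argparse/YAML plumbing from both TTSServerExecutor classes and moves configuration loading into an explicit TTSEngine.init(config_file) step backed by utils.config.get_config, so the engine no longer configures itself from command-line flags. A minimal sketch of how the refactored engine is expected to be driven (the call site is not part of this diff, so the import path and the unpacking of run()'s return value are assumptions based on the code above; the config path is the default file under conf/tts/):

    # Hypothetical caller, not part of this patch.
    from engine.tts.python.tts_engine import TTSEngine

    engine = TTSEngine()                            # __init__ now only calls the base class
    engine.init(config_file="./conf/tts/tts.yaml")  # loads the YAML via utils.config.get_config
    # run() documents Returns: lang, target_sample_rate, wav_base64
    lang, target_sample_rate, wav_base64 = engine.run(sentence="你好", spk_id=0)

The new test client can then be exercised against a running server (the endpoint is hard-coded to http://127.0.0.1:8090/paddlespeech/tts in test_client.py), for example:

    python speechserving/tests/tts/test_client.py --text "你好,欢迎使用语音合成服务" --output ./out.wav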