# TTS inference engine, supporting both streaming and non-streaming synthesis
# Simplified usage
# Inference is done with onnxruntime
# 1. Download the corresponding models
# 2. Load the models
# 3. End-to-end (non-streaming) inference
# 4. Streaming inference
import base64

import numpy as np
from paddlespeech.server.engine.tts.online.onnx.tts_engine import TTSEngine
from paddlespeech.server.utils.audio_process import float2pcm
from paddlespeech.server.utils.config import get_config
from paddlespeech.server.utils.util import denorm
from paddlespeech.t2s.frontend.zh_frontend import Frontend


class TTS:
    def __init__(self, config_path):
        self.config = get_config(config_path)['tts_online-onnx']
        self.config['voc_block'] = 36
        self.engine = TTSEngine()
        self.engine.init(self.config)
        self.engine.warm_up()
        # Initialize the Chinese text frontend (text -> phone ids)
        self.frontend = Frontend(
            phone_vocab_path=self.engine.executor.phones_dict,
            tone_vocab_path=None)

    def depadding(self, data, chunk_num, chunk_id, block, pad, upsample):
        """Remove the padded frames introduced by chunk-based streaming inference."""
        front_pad = min(chunk_id * block, pad)
        # first chunk
        if chunk_id == 0:
            data = data[:block * upsample]
        # last chunk
        elif chunk_id == chunk_num - 1:
            data = data[front_pad * upsample:]
        # middle chunk
        else:
            data = data[front_pad * upsample:(front_pad + block) * upsample]
        return data

    def offlineTTS(self, text):
        """Non-streaming (end-to-end) synthesis: returns the full waveform."""
        get_tone_ids = False
        merge_sentences = False
        input_ids = self.frontend.get_input_ids(
            text,
            merge_sentences=merge_sentences,
            get_tone_ids=get_tone_ids)
        phone_ids = input_ids["phone_ids"]
        wav_list = []
        for i in range(len(phone_ids)):
            # Acoustic model: encoder -> decoder -> postnet
            orig_hs = self.engine.executor.am_encoder_infer_sess.run(
                None, input_feed={'text': phone_ids[i].numpy()})
            hs = orig_hs[0]
            am_decoder_output = self.engine.executor.am_decoder_sess.run(
                None, input_feed={'xs': hs})
            am_postnet_output = self.engine.executor.am_postnet_sess.run(
                None,
                input_feed={
                    'xs': np.transpose(am_decoder_output[0], (0, 2, 1))
                })
            am_output_data = am_decoder_output + np.transpose(
                am_postnet_output[0], (0, 2, 1))
            # De-normalize the mel spectrogram, then run the vocoder
            normalized_mel = am_output_data[0][0]
            mel = denorm(normalized_mel, self.engine.executor.am_mu,
                         self.engine.executor.am_std)
            wav = self.engine.executor.voc_sess.run(
                output_names=None, input_feed={'logmel': mel})[0]
            wav_list.append(wav)
        wavs = np.concatenate(wav_list)
        return wavs

    def streamTTS(self, text):
        """Streaming synthesis: yields base64-encoded audio chunks."""
        for sub_wav_base64 in self.engine.run(sentence=text):
            yield sub_wav_base64

    def streamTTSBytes(self, text):
        """Streaming synthesis: yields raw int16 PCM bytes per chunk."""
        for wav in self.engine.executor.infer(
                text=text,
                lang=self.engine.config.lang,
                am=self.engine.config.am,
                spk_id=0):
            wav = float2pcm(wav)  # float32 -> int16
            wav_bytes = wav.tobytes()  # to bytes
            yield wav_bytes

    def after_process(self, wav):
        # For TVM: convert a float waveform to a base64-encoded PCM payload
        wav = float2pcm(wav)  # float32 -> int16
        wav_bytes = wav.tobytes()  # to bytes
        wav_base64 = base64.b64encode(wav_bytes).decode('utf8')  # to base64
        return wav_base64

    def streamTTS_TVM(self, text):
        # Optimize with TVM (not implemented yet)
        pass


if __name__ == '__main__':
    text = "啊哈哈哈哈哈哈啊哈哈哈哈哈哈啊哈哈哈哈哈哈啊哈哈哈哈哈哈啊哈哈哈哈哈哈"
    config_path = "../../PaddleSpeech/demos/streaming_tts_server/conf/tts_online_application.yaml"
    tts = TTS(config_path)
    for sub_wav in tts.streamTTS(text):
        print("sub_wav_base64: ", len(sub_wav))
    end_wav = tts.offlineTTS(text)
    print(end_wav)
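
    # A minimal sketch of persisting the streamed chunks to disk, not part of
    # the original demo. Assumptions: the output path "stream_out.wav" and the
    # 24000 Hz sample rate are placeholders; the rate must match the vocoder
    # configured in the loaded config. streamTTSBytes yields mono int16 PCM
    # chunks, so the standard-library wave module is enough to assemble a
    # playable file from them.
    import wave

    with wave.open("stream_out.wav", "wb") as f:
        f.setnchannels(1)      # mono
        f.setsampwidth(2)      # int16 -> 2 bytes per sample
        f.setframerate(24000)  # assumed sample rate, see note above
        for chunk in tts.streamTTSBytes(text):
            f.writeframes(chunk)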