diff --git a/setup.py b/setup.py index a6b18f9793163f65f8531be330586f9e26c778e1..cdb899e426ce82f6ea356fddf8b53d29cf17ed0b 100644 --- a/setup.py +++ b/setup.py @@ -61,6 +61,9 @@ requirements = { "visualdl", "webrtcvad", "yacs~=0.1.8", + # fastapi server + "fastapi", + "uvicorn", ], "develop": [ "ConfigArgParse", diff --git a/speechserving/speechserving/conf/application.yaml b/speechserving/speechserving/conf/application.yaml index 29b40b15826cd3d1f18eaa427766e1610484dc7a..8c4d9bc62d2ca4dac2676c48ca7154e85e414c35 100644 --- a/speechserving/speechserving/conf/application.yaml +++ b/speechserving/speechserving/conf/application.yaml @@ -11,4 +11,5 @@ port: 8090 ################################################################## # add engine type (Options: asr, tts) and config file here. engine_backend: - asr: 'conf/asr/asr.yaml' \ No newline at end of file + asr: 'conf/asr/asr.yaml' + tts: 'conf/tts/tts.yaml' diff --git a/speechserving/speechserving/engine/tts/python/tts_engine.py b/speechserving/speechserving/engine/tts/python/tts_engine.py index 65e35fb8fe77bd86b33d9fee91de3a70499c1fc1..e8d42619bdddcbedf67831d47f9831680dbfe3f2 100644 --- a/speechserving/speechserving/engine/tts/python/tts_engine.py +++ b/speechserving/speechserving/engine/tts/python/tts_engine.py @@ -13,19 +13,18 @@ # limitations under the License. 
import argparse import base64 -import os -import random +import io import librosa import numpy as np import soundfile as sf import yaml from engine.base_engine import BaseEngine -from ffmpeg import audio +from scipy.io import wavfile from paddlespeech.cli.log import logger from paddlespeech.cli.tts.infer import TTSExecutor -from utils.audio_types import wav2pcm +from utils.audio_process import change_speed from utils.errors import ErrorCode from utils.exception import ServerBaseException @@ -107,26 +106,27 @@ class TTSEngine(BaseEngine): wav_vol = wav_tar_fs * volume # transform speed - hash = random.getrandbits(128) - temp_wav = str(hash) + ".wav" - temp_speed_wav = str(hash + 1) + ".wav" - sf.write(temp_wav, wav_vol.reshape(-1, 1), target_fs) - audio.a_speed(temp_wav, speed, temp_speed_wav) - os.system("rm %s" % (temp_wav)) + try: # windows not support soxbindings + wav_speed = change_speed(wav_vol, speed, target_fs) + except: + raise ServerBaseException( + ErrorCode.SERVER_INTERNAL_ERR, + "Can not install soxbindings on your system.") # wav to base64 - with open(temp_speed_wav, 'rb') as f: - base64_bytes = base64.b64encode(f.read()) - wav_base64 = base64_bytes.decode('utf-8') + buf = io.BytesIO() + wavfile.write(buf, target_fs, wav_speed) + base64_bytes = base64.b64encode(buf.read()) + wav_base64 = base64_bytes.decode('utf-8') # save audio if audio_path is not None and audio_path.endswith(".wav"): - os.system("mv %s %s" % (temp_speed_wav, audio_path)) + sf.write(audio_path, wav_speed, target_fs) elif audio_path is not None and audio_path.endswith(".pcm"): - wav2pcm(temp_speed_wav, audio_path, data_type=np.int16) - os.system("rm %s" % (temp_speed_wav)) - else: - os.system("rm %s" % (temp_speed_wav)) + wav_norm = wav_speed * (32767 / max(0.001, + np.max(np.abs(wav_speed)))) + with open(audio_path, "wb") as f: + f.write(wav_norm.astype(np.int16)) return target_fs, wav_base64 diff --git a/speechserving/speechserving/utils/audio_process.py 
b/speechserving/speechserving/utils/audio_process.py
new file mode 100644
index 0000000000000000000000000000000000000000..51a19b3600a5295bde1a41e92c6dd32fe681553f
--- /dev/null
+++ b/speechserving/speechserving/utils/audio_process.py
@@ -0,0 +1,111 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import wave
+
+import numpy as np
+
+
+def wav2pcm(wavfile, pcmfile, data_type=np.int16):
+    """Strip the header from a WAV file and save the samples as raw PCM.
+
+    The first 44 bytes are skipped unconditionally, so the input is assumed
+    to start with a plain 44-byte header and no extra chunks.
+
+    :param wavfile: Path of the WAV file to read.
+    :param pcmfile: Destination passed to ``ndarray.tofile``.
+    :param data_type: NumPy dtype used to interpret the samples.
+    """
+    # The context manager closes the handle even if reading fails.
+    with open(wavfile, "rb") as f:
+        f.read(44)
+        data = np.fromfile(f, dtype=data_type)
+    data.tofile(pcmfile)
+
+
+def pcm2wav(pcm_file, wav_file, channels=1, bits=16, sample_rate=16000):
+    """Wrap raw PCM data in a WAV container.
+
+    :param pcm_file: Path of the raw PCM file to read.
+    :param wav_file: Destination passed to ``wave.open``.
+    :param channels: Number of channels written to the WAV header.
+    :param bits: Bits per sample; must be a multiple of 8.
+    :param sample_rate: Frame rate written to the WAV header.
+    :raises ValueError: If ``bits`` is not a multiple of 8.
+    """
+    with open(pcm_file, 'rb') as pcmf:
+        pcmdata = pcmf.read()
+
+    if bits % 8 != 0:
+        raise ValueError("bits % 8 must == 0. now bits:" + str(bits))
+
+    # ``with`` closes the output file even if a setter or the write raises.
+    with wave.open(wav_file, 'wb') as wav_out:
+        wav_out.setnchannels(channels)
+        wav_out.setsampwidth(bits // 8)
+        wav_out.setframerate(sample_rate)
+        wav_out.writeframes(pcmdata)
+
+
+def change_speed(sample_raw, speed_rate, sample_rate):
+    """Change the tempo of an audio signal with sox.
+
+    The input array is not modified; a new float32 array is returned.
+
+    :param sample_raw: Audio samples to transform.
+    :type sample_raw: numpy.ndarray
+    :param speed_rate: Rate of speed change:
+                       speed_rate > 1.0, speed up the audio;
+                       speed_rate = 1.0, unchanged;
+                       speed_rate < 1.0, slow down the audio;
+                       speed_rate <= 0.0, not allowed, raise ValueError.
+    :type speed_rate: float
+    :param sample_rate: Sampling rate of ``sample_raw``, given to sox as
+                        ``sample_rate_in``.
+    :return: The transformed samples as a new float32 array.
+    :rtype: numpy.ndarray
+    :raises ValueError: If speed_rate <= 0.0.
+    :raises RuntimeError: If soxbindings is missing and cannot be installed.
+    """
+    if speed_rate <= 0:
+        raise ValueError("speed_rate should be greater than zero.")
+    if speed_rate == 1.0:
+        # Callers use the return value, so hand back audio rather than None.
+        # Copy so the result never aliases the input.
+        return np.array(sample_raw, dtype=np.float32)
+
+    # soxbindings is optional; fall back to installing it on first use.
+    # NOTE(review): pip-installing at call time is slow and fragile --
+    # consider declaring soxbindings as a dependency instead.
+    try:
+        import soxbindings as sox
+    except ImportError:
+        try:
+            from paddlespeech.s2t.utils import dynamic_pip_install
+            dynamic_pip_install.install("sox")
+            dynamic_pip_install.install("soxbindings")
+            import soxbindings as sox
+        except Exception as err:
+            raise RuntimeError(
+                "Can not install soxbindings on your system.") from err
+
+    tfm = sox.Transformer()
+    tfm.set_globals(multithread=False)
+    tfm.tempo(speed_rate)
+    # squeeze(-1) drops the trailing axis of sox's output -- presumably the
+    # channel axis of mono audio (TODO confirm for multi-channel input).
+    sample_speed = tfm.build_array(
+        input_array=sample_raw,
+        sample_rate_in=sample_rate).squeeze(-1).astype(np.float32).copy()
+
+    return sample_speed
diff --git a/speechserving/speechserving/utils/audio_types.py b/speechserving/speechserving/utils/audio_types.py
deleted file mode 100644
index eb655ddd5902d27fb91fbc0718f7362400af91b4..0000000000000000000000000000000000000000
--- a/speechserving/speechserving/utils/audio_types.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import wave - -import numpy as np - - -def wav2pcm(wavfile, pcmfile, data_type=np.int16): - f = open(wavfile, "rb") - f.seek(0) - f.read(44) - data = np.fromfile(f, dtype=data_type) - data.tofile(pcmfile) - - -def pcm2wav(pcm_file, wav_file, channels=1, bits=16, sample_rate=16000): - pcmf = open(pcm_file, 'rb') - pcmdata = pcmf.read() - pcmf.close() - - if bits % 8 != 0: - raise ValueError("bits % 8 must == 0. now bits:" + str(bits)) - - wavfile = wave.open(wav_file, 'wb') - wavfile.setnchannels(channels) - wavfile.setsampwidth(bits // 8) - wavfile.setframerate(sample_rate) - wavfile.writeframes(pcmdata) - wavfile.close()