diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9751adf2038cae48642073612bf1a9b6f6ea78cf
--- /dev/null
+++ b/demos/text_to_speech/README.md
@@ -0,0 +1,102 @@
+# TTS (Text To Speech)
+
+## Introduction
+Text-to-speech (TTS) is a natural language processing task that converts units of text into units of speech for audio presentation.
+
+This demo is an implementation that generates audio from the given text. It can be done with a single command or a few lines of Python using `PaddleSpeech`.
+
+## Usage
+### 1. Installation
+```bash
+pip install paddlespeech
+```
+
+### 2. Prepare Input
+The input of this demo should be a piece of text in the specified language, which can be passed via argument.
+
+
+### 3. Usage
+- Command Line (Recommended)
+  ```bash
+  paddlespeech tts --input 今天的天气不错啊
+  ```
+  Usage:
+  ```bash
+  paddlespeech tts --help
+  ```
+  Arguments:
+  - `input` (required): Input text to generate.
+  - `am`: Acoustic model type of tts task. Default: `fastspeech2_csmsc`.
+  - `am_config`: Config of acoustic model. Use default config when it is None. Default: `None`.
+  - `am_ckpt`: Acoustic model checkpoint. Use pretrained model when it is None. Default: `None`.
+  - `am_stat`: Mean and standard deviation used to normalize spectrogram when training acoustic model. Default: `None`.
+  - `phones_dict`: Phone vocabulary file. Default: `None`.
+  - `tones_dict`: Tone vocabulary file. Default: `None`.
+  - `speaker_dict`: Speaker id map file. Default: `None`.
+  - `spk_id`: Speaker id for multi-speaker acoustic model. Default: `0`.
+  - `voc`: Vocoder type of tts task. Default: `pwgan_csmsc`.
+  - `voc_config`: Config of vocoder. Use default config when it is None. Default: `None`.
+  - `voc_ckpt`: Vocoder checkpoint. Use pretrained model when it is None. Default: `None`.
+  - `voc_stat`: Mean and standard deviation used to normalize spectrogram when training vocoder. Default: `None`.
+  - `lang`: Language of tts task. Default: `zh`.
+  - `device`: Device to execute model inference on. Default: the default device of paddlepaddle in the current environment.
+  - `output`: Output wave filepath. Default: `output.wav`.
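+
+  For example, a sketch of a request that switches to the English pretrained models listed in the Pretrained Models section below (the output filename is only an illustrative choice):
+  ```bash
+  # Example: English acoustic model and vocoder, custom output path
+  paddlespeech tts --lang en --am fastspeech2_ljspeech --voc pwgan_ljspeech --input "Hello, welcome to PaddleSpeech." --output output_en.wav
+  ```
+  Either invocation ends by logging the path of the generated wave file, as shown below.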
+
+  Output:
+  ```bash
+  [2021-12-09 20:49:58,955] [ INFO] [log.py] [L57] - Wave file has been generated: output.wav
+  ```
+
+- Python API
+  ```python
+  import paddle
+  from paddlespeech.cli import TTSExecutor
+
+  tts_executor = TTSExecutor()
+  wav_file = tts_executor(
+      text='今天的天气不错啊',
+      output='output.wav',
+      am='fastspeech2_csmsc',
+      am_config=None,
+      am_ckpt=None,
+      am_stat=None,
+      spk_id=0,
+      phones_dict=None,
+      tones_dict=None,
+      speaker_dict=None,
+      voc='pwgan_csmsc',
+      voc_config=None,
+      voc_ckpt=None,
+      voc_stat=None,
+      lang='zh',
+      device=paddle.get_device())
+  print('Wave file has been generated: {}'.format(wav_file))
+  ```
+
+  Output:
+  ```bash
+  Wave file has been generated: output.wav
+  ```
+
+
+### 4. Pretrained Models
+
+Here is a list of pretrained models released by PaddleSpeech that can be used by the command line and Python API:
+
+- Acoustic model
+  | Model | Language |
+  | :--- | :---: |
+  | speedyspeech_csmsc | zh |
+  | fastspeech2_csmsc | zh |
+  | fastspeech2_aishell3 | zh |
+  | fastspeech2_ljspeech | en |
+  | fastspeech2_vctk | en |
+
+- Vocoder
+  | Model | Language |
+  | :--- | :---: |
+  | pwgan_csmsc | zh |
+  | pwgan_aishell3 | zh |
+  | pwgan_ljspeech | en |
+  | pwgan_vctk | en |
+  | mb_melgan_csmsc | zh |
diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
index d5eac2b24c67bfb33606c13bae025ac91077533f..8fe5f90ad4adc11822c27de4d776dae3ff84cbc0 100644
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -236,6 +236,7 @@ class TTSExecutor(BaseExecutor):
         self.parser.add_argument(
             "--am_stat",
             type=str,
+            default=None,
             help="mean and standard deviation used to normalize spectrogram when training acoustic model."
         )
         self.parser.add_argument(
@@ -282,6 +283,7 @@ class TTSExecutor(BaseExecutor):
         self.parser.add_argument(
             "--voc_stat",
             type=str,
+            default=None,
             help="mean and standard deviation used to normalize spectrogram when training voc."
         )
         # other
@@ -543,6 +545,7 @@ class TTSExecutor(BaseExecutor):
         Returns:
             Union[str, os.PathLike]: Human-readable results such as texts and audio files.
""" + output = os.path.abspath(os.path.expanduser(output)) sf.write( output, self._outputs['wav'].numpy(), samplerate=self.am_config.fs) return output @@ -593,7 +596,7 @@ class TTSExecutor(BaseExecutor): lang=lang, device=device, output=output) - logger.info('TTS Result Saved in: {}'.format(res)) + logger.info('Wave file has been generated: {}'.format(res)) return True except Exception as e: logger.exception(e) diff --git a/paddlespeech/cli/utils.py b/paddlespeech/cli/utils.py index 8ba780a719f869349f2426f1354a2a71cb7f157e..ee31b771bd7a0e4cbbb2e5d29fa70ebc8e8a2de4 100644 --- a/paddlespeech/cli/utils.py +++ b/paddlespeech/cli/utils.py @@ -56,12 +56,14 @@ def get_command(name: str) -> Any: def _get_uncompress_path(filepath: os.PathLike) -> os.PathLike: file_dir = os.path.dirname(filepath) + is_zip_file = False if tarfile.is_tarfile(filepath): files = tarfile.open(filepath, "r:*") file_list = files.getnames() elif zipfile.is_zipfile(filepath): files = zipfile.ZipFile(filepath, 'r') file_list = files.namelist() + is_zip_file = True else: return file_dir @@ -69,7 +71,10 @@ def _get_uncompress_path(filepath: os.PathLike) -> os.PathLike: rootpath = file_list[0] uncompressed_path = os.path.join(file_dir, rootpath) elif download._is_a_single_dir(file_list): - rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1] + if is_zip_file: + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[0] + else: + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1] uncompressed_path = os.path.join(file_dir, rootpath) else: rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]