diff --git a/modules/audio/audio_classification/PANNs/cnn10/module.py b/modules/audio/audio_classification/PANNs/cnn10/module.py
index 4a45bbe84d78dd967241880d35ee9ca69e3f3e5b..4f474d1f67cbc17ea8b397173019b74bcfda934d 100644
--- a/modules/audio/audio_classification/PANNs/cnn10/module.py
+++ b/modules/audio/audio_classification/PANNs/cnn10/module.py
@@ -31,7 +31,7 @@ from paddlehub.utils.log import logger
     name="panns_cnn10",
     version="1.0.0",
     summary="",
-    author="Baidu",
+    author="paddlepaddle",
     author_email="",
     type="audio/sound_classification",
     meta=AudioClassifierModule)
diff --git a/modules/audio/audio_classification/PANNs/cnn14/module.py b/modules/audio/audio_classification/PANNs/cnn14/module.py
index eb0efc318192c39b03b810824e0a7fd37071cf01..0bd1826e20b394dfbbf007f3ac5079f3f8727fbc 100644
--- a/modules/audio/audio_classification/PANNs/cnn14/module.py
+++ b/modules/audio/audio_classification/PANNs/cnn14/module.py
@@ -31,7 +31,7 @@ from paddlehub.utils.log import logger
     name="panns_cnn14",
     version="1.0.0",
     summary="",
-    author="Baidu",
+    author="paddlepaddle",
     author_email="",
     type="audio/sound_classification",
     meta=AudioClassifierModule)
diff --git a/modules/audio/audio_classification/PANNs/cnn6/module.py b/modules/audio/audio_classification/PANNs/cnn6/module.py
index 360cccf2fc0c8092cb6f642f8f56e7cf47049b11..ec70e75d97045743468b3ecaea5de83e2767b49a 100644
--- a/modules/audio/audio_classification/PANNs/cnn6/module.py
+++ b/modules/audio/audio_classification/PANNs/cnn6/module.py
@@ -31,7 +31,7 @@ from paddlehub.utils.log import logger
     name="panns_cnn6",
     version="1.0.0",
     summary="",
-    author="Baidu",
+    author="paddlepaddle",
     author_email="",
     type="audio/sound_classification",
     meta=AudioClassifierModule)
diff --git a/modules/audio/voice_cloning/lstm_tacotron2/README.md b/modules/audio/voice_cloning/lstm_tacotron2/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..58d6e846a25ddded31a10d6632aaaf6d7563f723
--- /dev/null
+++ b/modules/audio/voice_cloning/lstm_tacotron2/README.md
@@ -0,0 +1,102 @@
+```shell
+$ hub install lstm_tacotron2==1.0.0
+```
+
+## Overview
+
+Voice cloning synthesizes speech for a given text with a specific timbre, so that the generated audio carries the characteristics of the target speaker.
+
+During training, audio with the target timbre is fed to the Speaker Encoder, which extracts the speaker characteristics (timbre) of the utterance as a Speaker Embedding. When the model is then trained to re-synthesize speech with this timbre, the speaker embedding is added as an extra condition alongside the input text.
+
+At inference time, a new recording of the target timbre is passed to the Speaker Encoder and its speaker features are extracted, so that given a piece of text and a reference recording, the model generates a speech clip of the text spoken in the target voice.
+
+![](https://ai-studio-static-online.cdn.bcebos.com/982ab955b87244d3bae3b003aff8e28d9ec159ff0d6246a79757339076dfe7d4)
+
+`lstm_tacotron2` is a voice cloning model that supports Chinese. It uses LSTMSpeakerEncoder, Tacotron2 and WaveFlow for speaker feature extraction, target acoustic feature synthesis and waveform generation, respectively.
+
+For more details about the models, please refer to [Parakeet](https://github.com/PaddlePaddle/Parakeet/tree/release/v0.3/parakeet/models).
+
+
+## API
+
+```python
+def __init__(speaker_audio: str = None,
+             output_dir: str = './')
+```
+Initializes the module; the audio file providing the target timbre and the output path can be configured.
+
+**Parameters**
+- `speaker_audio` (str): Path to the target speaker's audio file (*.wav). Defaults to None (a built-in female voice is used as the target timbre).
+- `speaker_audio` above is optional; `output_dir` (str): Directory where synthesized audio files are saved. Defaults to the current directory.
+
+
+```python
+def get_speaker_embedding()
+```
+Gets the target speaker features of the model.
+
+**Returns**
+* `results` (numpy.ndarray): A numpy array of length 256 representing the target speaker's features.
+
+```python
+def set_speaker_embedding(speaker_audio: str)
+```
+Sets the target speaker features of the model.
+
+**Parameters**
+- `speaker_audio` (str): Required. Path to the target speaker's audio file (*.wav).
+
+```python
+def generate(data: List[str], batch_size: int = 1, use_gpu: bool = False):
+```
+Synthesizes audio files in the target speaker's voice from the input texts.
+
+**Parameters**
+- `data` (List[str]): Required. List of texts to synthesize. Currently only Chinese is supported, and punctuation marks are not supported.
+- `batch_size` (int): Optional. Batch size used during synthesis. Defaults to 1.
+- `use_gpu` (bool): Whether to use the GPU for computation. Defaults to False.
+
+
+**Code example**
+
+```python
+import paddlehub as hub
+
+model = hub.Module(name='lstm_tacotron2', output_dir='./', speaker_audio='/data/man.wav')  # audio file providing the target timbre
+texts = [
+    '语音的表现形式在未来将变得越来越重要$',
+    '今天的天气怎么样$',
+]
+wavs = model.generate(texts, use_gpu=True)
+
+for text, wav in zip(texts, wavs):
+    print('='*30)
+    print(f'Text: {text}')
+    print(f'Wav: {wav}')
+```
+
+Output
+```
+==============================
+Text: 语音的表现形式在未来将变得越来越重要$
+Wav: /data/1.wav
+==============================
+Text: 今天的天气怎么样$
+Wav: /data/2.wav
+```
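+
+The snippet below is a minimal sketch of the remaining APIs: it switches the target speaker at runtime with `set_speaker_embedding` and inspects the extracted feature with `get_speaker_embedding`. The wav path is a placeholder and should be replaced with a real recording.
+
+```python
+import paddlehub as hub
+
+model = hub.Module(name='lstm_tacotron2', output_dir='./')  # built-in female voice is used until a speaker is set
+
+# Switch to a different target speaker at runtime (placeholder path).
+model.set_speaker_embedding('/data/woman.wav')
+
+# The speaker feature is a numpy array of length 256, as documented above.
+embedding = model.get_speaker_embedding()
+print(embedding.shape)  # (256,)
+
+wavs = model.generate(['今天的天气怎么样$'], batch_size=1, use_gpu=False)
+print(wavs)
+```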
+
+
+## Code
+
+https://github.com/PaddlePaddle/Parakeet
+
+## Dependencies
+
+paddlepaddle >= 2.0.0
+
+paddlehub >= 2.1.0
+
+## Release Note
+
+* 1.0.0
+
+  First release
diff --git a/modules/audio/voice_cloning/lstm_tacotron2/module.py b/modules/audio/voice_cloning/lstm_tacotron2/module.py
index f4fb2960d3df075c33f8d270f1dba5eae712bb91..8e60afa2bb9a74e4922e99eef219e1816f9968af 100644
--- a/modules/audio/voice_cloning/lstm_tacotron2/module.py
+++ b/modules/audio/voice_cloning/lstm_tacotron2/module.py
@@ -23,14 +23,9 @@ from paddlehub.env import MODULE_HOME
 from paddlehub.module.module import moduleinfo
 from paddlehub.utils.log import logger
 from paddlenlp.data import Pad
-import soundfile as sf
-
-if not importlib.util.find_spec('parakeet'):
-    raise ImportError('The module requires additional dependencies: "parakeet".\n'
-                      'You can install parakeet via "git clone https://github.com'
-                      '/PaddlePaddle/Parakeet -b release/v0.3 && pip install -e Parakeet"')
 from parakeet.models import ConditionalWaveFlow, Tacotron2
 from parakeet.models.lstm_speaker_encoder import LSTMSpeakerEncoder
+import soundfile as sf
 
 from .audio_processor import SpeakerVerificationPreprocessor
 from .chinese_g2p import convert_sentence
@@ -41,7 +36,7 @@ from .preprocess_transcription import voc_phones, voc_tones, phone_pad_token, to
     name="lstm_tacotron2",
     version="1.0.0",
     summary="",
-    author="Baidu",
+    author="paddlepaddle",
     author_email="",
     type="audio/voice_cloning",
 )
diff --git a/modules/audio/voice_cloning/lstm_tacotron2/preprocess_transcription.py b/modules/audio/voice_cloning/lstm_tacotron2/preprocess_transcription.py
index 715121d030dfe3860d7b037b1f0d8f81a5e3942f..5c88cb4c71af42d8479eb78e6b0b667f4d64fbac 100644
--- a/modules/audio/voice_cloning/lstm_tacotron2/preprocess_transcription.py
+++ b/modules/audio/voice_cloning/lstm_tacotron2/preprocess_transcription.py
@@ -19,7 +19,6 @@ import re
 from parakeet.frontend import Vocab
 import tqdm
-import yaml
 
 zh_pattern = re.compile("[\u4e00-\u9fa5]")
 
@@ -180,75 +179,3 @@ def split_syllable(syllable: str):
         phones.append(syllable)
         tones.append(tone)
     return phones, tones
-
-
-def load_aishell3_transcription(line: str):
-    sentence_id, pinyin, text = line.strip().split("|")
-    syllables = pinyin.strip().split()
-
-    results = []
-
-    for syllable in syllables:
-        if syllable in _pauses:
-            results.append(syllable)
-        elif not ernized(syllable):
-            results.append(syllable)
-        else:
-            results.append(syllable[:-2] + syllable[-1])
-            results.append('&r5')
-
-    phones = []
-    tones = []
-    for syllable in results:
-        p, t = split_syllable(syllable)
-        phones.extend(p)
-        tones.extend(t)
-    for p in phones:
-        assert p in _phones, p
-    return {"sentence_id": sentence_id, "text": text, "syllables": results, "phones": phones, "tones": tones}
-
-
-def process_aishell3(dataset_root, output_dir):
-    dataset_root = Path(dataset_root).expanduser()
-    output_dir = Path(output_dir).expanduser()
-    output_dir.mkdir(parents=True, exist_ok=True)
-
-    prosody_label_path = dataset_root / "label_train-set.txt"
-    with open(prosody_label_path, 'rt') as f:
-        lines = [line.strip() for line in f]
-
-    records = lines[5:]
-
-    processed_records = []
-    for record in tqdm.tqdm(records):
-        new_record = load_aishell3_transcription(record)
-        processed_records.append(new_record)
-        print(new_record)
-
-    with open(output_dir / "metadata.pickle", 'wb') as f:
-        pickle.dump(processed_records, f)
-
-    with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f:
-        yaml.safe_dump(processed_records, f, default_flow_style=None, allow_unicode=True)
-
-    print("metadata done!")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Preprocess transcription of AiShell3 and save them in a compact file(yaml and pickle).")
-    parser.add_argument(
-        "--input",
-        type=str,
-        default="~/datasets/aishell3/train",
-        help="path of the training dataset,(contains a label_train-set.txt).")
-    parser.add_argument(
-        "--output",
-        type=str,
-        help="the directory to save the processed transcription."
-        "If not provided, it would be the same as the input.")
-    args = parser.parse_args()
-    if args.output is None:
-        args.output = args.input
-
-    process_aishell3(args.input, args.output)
diff --git a/modules/audio/voice_cloning/lstm_tacotron2/requirements.txt b/modules/audio/voice_cloning/lstm_tacotron2/requirements.txt
index 3c298ee3dc491879e61f0a1da7c8f0c576448b0a..013164d7c3fa849c686cdde69a260f95d83a8e64 100644
--- a/modules/audio/voice_cloning/lstm_tacotron2/requirements.txt
+++ b/modules/audio/voice_cloning/lstm_tacotron2/requirements.txt
@@ -1,7 +1 @@
-librosa
-nltk
-pypinyin
-scipy
-soundfile
-webrtcvad
-yaml
+paddle-parakeet
diff --git a/modules/text/text_generation/plato-mini/module.py b/modules/text/text_generation/plato-mini/module.py
index 4a3594ef871d5eaf6c5119d07f253cb657f9b6fb..b6ba1be9e4c3033b4ff2e8f1aaac3b7f68f400dd 100644
--- a/modules/text/text_generation/plato-mini/module.py
+++ b/modules/text/text_generation/plato-mini/module.py
@@ -30,7 +30,7 @@ from plato_mini.utils import select_response
     name="plato-mini",
     version="1.0.0",
     summary="",
-    author="PaddlePaddle",
+    author="paddlepaddle",
     author_email="",
     type="nlp/text_generation",
 )
diff --git a/modules/text/text_generation/unified_transformer-12L-cn-luge/module.py b/modules/text/text_generation/unified_transformer-12L-cn-luge/module.py
index e6bb87f525eb72d8fe5cabcc915535f22fecbdff..52ef5532db84d696960d4ac28ef1cba4bbfbc75c 100644
--- a/modules/text/text_generation/unified_transformer-12L-cn-luge/module.py
+++ b/modules/text/text_generation/unified_transformer-12L-cn-luge/module.py
@@ -30,7 +30,7 @@ from unified_transformer_12L_cn_luge.utils import select_response
     name="unified_transformer_12L_cn_luge",
     version="1.0.0",
     summary="",
-    author="PaddlePaddle",
+    author="paddlepaddle",
     author_email="",
     type="nlp/text_generation",
 )
diff --git a/modules/text/text_generation/unified_transformer-12L-cn/module.py b/modules/text/text_generation/unified_transformer-12L-cn/module.py
index 6292921b432036a8e1d2747d1a0edcb2e2b58a7b..ee09a55d0c2853a7abfcf6a19bc727c1de5c1ad2 100644
--- a/modules/text/text_generation/unified_transformer-12L-cn/module.py
+++ b/modules/text/text_generation/unified_transformer-12L-cn/module.py
@@ -30,7 +30,7 @@ from unified_transformer_12L_cn.utils import select_response
     name="unified_transformer_12L_cn",
     version="1.0.0",
     summary="",
-    author="PaddlePaddle",
+    author="paddlepaddle",
     author_email="",
     type="nlp/text_generation",
 )