diff --git a/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py index 0c2bb02d64d97104d2ce0c79a32570bb8f5d2277..b6440fd6f27611188cfca95fed7c2c7deac59b1c 100644 --- a/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py +++ b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py @@ -15,21 +15,22 @@ # for mb melgan finetune # 长度和原本的 mel 不一致怎么办? import argparse +import os from pathlib import Path import numpy as np import paddle import yaml -from yacs.config import CfgNode from tqdm import tqdm -import os +from yacs.config import CfgNode from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur from paddlespeech.t2s.datasets.preprocess_utils import merge_silence +from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.models.speedyspeech import SpeedySpeech from paddlespeech.t2s.models.speedyspeech import SpeedySpeechInference from paddlespeech.t2s.modules.normalizer import ZScore -from paddlespeech.t2s.frontend.zh_frontend import Frontend + def evaluate(args, speedyspeech_config): rootdir = Path(args.rootdir).expanduser() @@ -50,17 +51,21 @@ def evaluate(args, speedyspeech_config): tone_size = len(tone_id) print("tone_size:", tone_size) - frontend = Frontend(phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) + frontend = Frontend( + phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) if args.speaker_dict: with open(args.speaker_dict, 'rt') as f: spk_id_list = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id_list) else: - spk_num=None + spk_num = None model = SpeedySpeech( - vocab_size=vocab_size, tone_size=tone_size, **speedyspeech_config["model"], spk_num=spk_num) + vocab_size=vocab_size, + tone_size=tone_size, + **speedyspeech_config["model"], + spk_num=spk_num) model.set_state_dict( paddle.load(args.speedyspeech_checkpoint)["main_params"]) @@ -105,9 +110,15 @@ def evaluate(args, speedyspeech_config): else: train_wav_files += wav_files - train_wav_files = [os.path.basename(str(str_path)) for str_path in train_wav_files] - dev_wav_files = [os.path.basename(str(str_path)) for str_path in dev_wav_files] - test_wav_files = [os.path.basename(str(str_path)) for str_path in test_wav_files] + train_wav_files = [ + os.path.basename(str(str_path)) for str_path in train_wav_files + ] + dev_wav_files = [ + os.path.basename(str(str_path)) for str_path in dev_wav_files + ] + test_wav_files = [ + os.path.basename(str(str_path)) for str_path in test_wav_files + ] for i, utt_id in enumerate(tqdm(sentences)): phones = sentences[utt_id][0] @@ -122,8 +133,7 @@ def evaluate(args, speedyspeech_config): durations = durations[:-1] phones = phones[:-1] - phones, tones = frontend._get_phone_tone( - phones, get_tone_ids=True) + phones, tones = frontend._get_phone_tone(phones, get_tone_ids=True) if tones: tone_ids = frontend._t2id(tones) tone_ids = paddle.to_tensor(tone_ids) @@ -132,7 +142,8 @@ def evaluate(args, speedyspeech_config): phone_ids = paddle.to_tensor(phone_ids) if args.speaker_dict: - speaker_id = int([item[1] for item in spk_id_list if speaker == item[0]][0]) + speaker_id = int( + [item[1] for item in spk_id_list if speaker == item[0]][0]) speaker_id = paddle.to_tensor(speaker_id) else: speaker_id = None @@ -155,7 +166,8 @@ def evaluate(args, speedyspeech_config): sub_output_dir.mkdir(parents=True, exist_ok=True) with paddle.no_grad(): - mel = speedyspeech_inference(phone_ids, tone_ids, durations=durations, spk_id=speaker_id) + mel = speedyspeech_inference( + phone_ids, tone_ids, durations=durations, spk_id=speaker_id) np.save(sub_output_dir / (utt_id + "_feats.npy"), mel) @@ -193,10 +205,7 @@ def main(): default="tone_id_map.txt", help="tone vocabulary file.") parser.add_argument( - "--speaker-dict", - type=str, - default=None, - help="speaker id map file.") + "--speaker-dict", type=str, default=None, help="speaker id map file.") parser.add_argument( "--dur-file", default=None, type=str, help="path to durations.txt.") diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py index 263b4c6b93a5ebe180b81a8c88a1c6518dc924c0..acddd976f52000a58fd1b59f4994032826771e34 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py @@ -272,9 +272,6 @@ class SpeedySpeechInference(nn.Layer): def forward(self, phones, tones, durations=None, spk_id=None): normalized_mel = self.acoustic_model.inference( - phones, - tones, - durations=durations, - spk_id=spk_id) + phones, tones, durations=durations, spk_id=spk_id) logmel = self.normalizer.inverse(normalized_mel) - return logmel \ No newline at end of file + return logmel diff --git a/utils/link_wav.py b/utils/link_wav.py index a9ad4136c07b2cf7f3bdb35450d0015c6c9777f5..8fe2156b273a0c9c56a96f9d6d22e00d5f160d98 100644 --- a/utils/link_wav.py +++ b/utils/link_wav.py @@ -20,6 +20,7 @@ import jsonlines import numpy as np from tqdm import tqdm + def main(): # parse config and args parser = argparse.ArgumentParser( @@ -61,9 +62,10 @@ def main(): try: wav = np.load(old_dump_dir / sub / ("raw/" + wave_name)) os.symlink(old_dump_dir / sub / ("raw/" + wave_name), - output_dir / ("raw/" + wave_name)) + output_dir / ("raw/" + wave_name)) except FileNotFoundError: - print("delete " + name + " because it cannot be found in the dump folder") + print("delete " + name + + " because it cannot be found in the dump folder") os.remove(output_dir / "raw" / name) continue except FileExistsError: