diff --git a/examples/csmsc/voc1/run.sh b/examples/csmsc/voc1/run.sh
index 409c8bf0f2c77f9dcbdee127ca6eaae611d369f8..16309543948c1a4de048e977639ddde86c4769b2 100755
--- a/examples/csmsc/voc1/run.sh
+++ b/examples/csmsc/voc1/run.sh
@@ -3,7 +3,7 @@
 set -e
 source path.sh
 
-gpus=4,5
+gpus=0,1
 stage=0
 stop_stage=100
 
diff --git a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py
index ee9fe05792420758b3d0fccbbb0538b4a96c28cc..1839415e978d84b7fbf83f3516f01176e4aabe6c 100644
--- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py
@@ -46,14 +46,14 @@ def evaluate(args, fastspeech2_config, pwg_config):
     print("vocab_size:", vocab_size)
     with open(args.speaker_dict, 'rt') as f:
         spk_id = [line.strip().split() for line in f.readlines()]
-    num_speakers = len(spk_id)
-    print("num_speakers:", num_speakers)
+    spk_num = len(spk_id)
+    print("spk_num:", spk_num)
 
     odim = fastspeech2_config.n_mels
     model = FastSpeech2(
         idim=vocab_size,
         odim=odim,
-        num_speakers=num_speakers,
+        spk_num=spk_num,
         **fastspeech2_config["model"])
 
     model.set_state_dict(
diff --git a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py
index b5d0ce1716b40cc82dca81789b4809fe635f5366..095d20821132cfd0f78485a025d91494f0409b83 100644
--- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py
+++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py
@@ -51,14 +51,14 @@ def evaluate(args, fastspeech2_config, pwg_config):
     print("vocab_size:", vocab_size)
     with open(args.speaker_dict, 'rt') as f:
         spk_id = [line.strip().split() for line in f.readlines()]
-    num_speakers = len(spk_id)
-    print("num_speakers:", num_speakers)
+    spk_num = len(spk_id)
+    print("spk_num:", spk_num)
 
     odim = fastspeech2_config.n_mels
     model = FastSpeech2(
         idim=vocab_size,
         odim=odim,
-        num_speakers=num_speakers,
+        spk_num=spk_num,
         **fastspeech2_config["model"])
 
     model.set_state_dict(
diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize.py b/paddlespeech/t2s/exps/fastspeech2/synthesize.py
index c9f36f9ca2a18637fa50ba0aae2fc96d55ebeb87..249845e4dba5b96747a881eeeb2a2299e020969f 100644
--- a/paddlespeech/t2s/exps/fastspeech2/synthesize.py
+++ b/paddlespeech/t2s/exps/fastspeech2/synthesize.py
@@ -40,19 +40,19 @@ def evaluate(args, fastspeech2_config, pwg_config):
 
     fields = ["utt_id", "text"]
 
-    num_speakers = None
+    spk_num = None
     if args.speaker_dict is not None:
         print("multiple speaker fastspeech2!")
         with open(args.speaker_dict, 'rt') as f:
             spk_id = [line.strip().split() for line in f.readlines()]
-        num_speakers = len(spk_id)
+        spk_num = len(spk_id)
         fields += ["spk_id"]
     elif args.voice_cloning:
         print("voice cloning!")
         fields += ["spk_emb"]
     else:
         print("single speaker fastspeech2!")
-    print("num_speakers:", num_speakers)
+    print("spk_num:", spk_num)
 
     test_dataset = DataTable(data=test_metadata, fields=fields)
 
@@ -65,7 +65,7 @@ def evaluate(args, fastspeech2_config, pwg_config):
     model = FastSpeech2(
         idim=vocab_size,
         odim=odim,
-        num_speakers=num_speakers,
+        spk_num=spk_num,
         **fastspeech2_config["model"])
 
     model.set_state_dict(
diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py
index 6f42a71e469fc2d70f4b5ced1da03cbfcc2c100a..fafded6fca0cef090d9ba1c844cdc526fe5d49d6 100644
--- a/paddlespeech/t2s/exps/fastspeech2/train.py
+++ b/paddlespeech/t2s/exps/fastspeech2/train.py
@@ -62,13 +62,13 @@ def train_sp(args, config):
         "pitch", "energy"
     ]
     converters = {"speech": np.load, "pitch": np.load, "energy": np.load}
-    num_speakers = None
+    spk_num = None
     if args.speaker_dict is not None:
         print("multiple speaker fastspeech2!")
         collate_fn = fastspeech2_multi_spk_batch_fn
         with open(args.speaker_dict, 'rt') as f:
             spk_id = [line.strip().split() for line in f.readlines()]
-        num_speakers = len(spk_id)
+        spk_num = len(spk_id)
         fields += ["spk_id"]
     elif args.voice_cloning:
         print("Training voice cloning!")
@@ -78,7 +78,7 @@ def train_sp(args, config):
     else:
         print("single speaker fastspeech2!")
         collate_fn = fastspeech2_single_spk_batch_fn
-    print("num_speakers:", num_speakers)
+    print("spk_num:", spk_num)
 
     # dataloader has been too verbose
     logging.getLogger("DataLoader").disabled = True
@@ -129,10 +129,7 @@ def train_sp(args, config):
 
     odim = config.n_mels
     model = FastSpeech2(
-        idim=vocab_size,
-        odim=odim,
-        num_speakers=num_speakers,
-        **config["model"])
+        idim=vocab_size, odim=odim, spk_num=spk_num, **config["model"])
     if world_size > 1:
         model = DataParallel(model)
     print("model done!")
diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
index d0084f721af183484ebea2ca0befb1ee6f324965..4ae6f8ace6ad19c56caa261db244d1e6ef91169a 100644
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@@ -96,7 +96,7 @@ class FastSpeech2(nn.Layer):
             pitch_embed_dropout: float=0.5,
             stop_gradient_from_pitch_predictor: bool=False,
             # spk emb
-            num_speakers: int=None,
+            spk_num: int=None,
             spk_embed_dim: int=None,
             spk_embed_integration_type: str="add",
             # tone emb
@@ -146,9 +146,9 @@ class FastSpeech2(nn.Layer):
 
         # initialize parameters
         initialize(self, init_type)
 
-        if self.spk_embed_dim and num_speakers:
+        if spk_num and self.spk_embed_dim:
             self.spk_embedding_table = nn.Embedding(
-                num_embeddings=num_speakers,
+                num_embeddings=spk_num,
                 embedding_dim=self.spk_embed_dim,
                 padding_idx=self.padding_idx)
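For context, a minimal Python sketch of what the renamed parameter controls. This is not part of the patch: the speaker-dict lines, spk_embed_dim, and padding_idx below are illustrative placeholders (the repo reads them from --speaker-dict and the model yaml instead). spk_num is simply the number of entries in the speaker-id map, and FastSpeech2 only builds its speaker-embedding table when both spk_num and spk_embed_dim are set.

import paddle
from paddle import nn

# Stand-in for the file passed via --speaker-dict ("<speaker> <id>" per line);
# assumed contents, not taken from the patch.
speaker_dict_lines = ["speaker0 0", "speaker1 1", "speaker2 2"]
spk_id = [line.strip().split() for line in speaker_dict_lines]
spk_num = len(spk_id)
print("spk_num:", spk_num)

spk_embed_dim = 256  # assumed value; the repo takes this from the model config
padding_idx = 0      # assumed value; FastSpeech2 derives its own padding_idx

# Mirrors the guard added in fastspeech2.py: the speaker-embedding table is
# only created in the multi-speaker case, i.e. when both values are given.
if spk_num and spk_embed_dim:
    spk_embedding_table = nn.Embedding(
        num_embeddings=spk_num,
        embedding_dim=spk_embed_dim,
        padding_idx=padding_idx)
    # Look up the embedding for speaker id 1 -> shape [1, spk_embed_dim].
    print(spk_embedding_table(paddle.to_tensor([1])).shape)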