diff --git a/examples/csmsc/vits/conf/default.yaml b/examples/csmsc/vits/conf/default.yaml index 47af780dc656533c147380b4b8b92ccf3a616076..32f995cc9489359bc91bb951442c5fde78286724 100644 --- a/examples/csmsc/vits/conf/default.yaml +++ b/examples/csmsc/vits/conf/default.yaml @@ -178,6 +178,8 @@ generator_first: False # whether to start updating generator first ########################################################## # OTHER TRAINING SETTING # ########################################################## -max_epoch: 1000 # number of epochs -num_snapshots: 10 # max number of snapshots to keep while training -seed: 777 # random seed number +num_snapshots: 10 # max number of snapshots to keep while training +train_max_steps: 250000 # Number of training steps. == total_iters / ngpus, total_iters = 1000000 +save_interval_steps: 1000 # Interval steps to save checkpoint. +eval_interval_steps: 250 # Interval steps to evaluate the network. +seed: 777 # random seed number diff --git a/examples/csmsc/vits/local/preprocess.sh b/examples/csmsc/vits/local/preprocess.sh index 1d3ae59376499add5ef8479499254beada6df642..1cd6d1f9b0c0fa3f47b088195bf76c0d6d08f48b 100755 --- a/examples/csmsc/vits/local/preprocess.sh +++ b/examples/csmsc/vits/local/preprocess.sh @@ -4,6 +4,7 @@ stage=0 stop_stage=100 config_path=$1 +add_blank=$2 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # get durations from MFA's result @@ -44,6 +45,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --feats-stats=dump/train/feats_stats.npy \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt \ + --add-blank=${add_blank} \ --skip-wav-copy python3 ${BIN_DIR}/normalize.py \ @@ -52,6 +54,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --feats-stats=dump/train/feats_stats.npy \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt \ + --add-blank=${add_blank} \ --skip-wav-copy python3 ${BIN_DIR}/normalize.py \ @@ -60,5 +63,6 @@ if [ ${stage} -le 3 
] && [ ${stop_stage} -ge 3 ]; then --feats-stats=dump/train/feats_stats.npy \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt \ + --add-blank=${add_blank} \ --skip-wav-copy fi diff --git a/examples/csmsc/vits/local/synthesize_e2e.sh b/examples/csmsc/vits/local/synthesize_e2e.sh index edbb07bfc803ccd558477977143bdbe53280fc62..3f3bf6517a415020961eeef70ace4921d8062ee9 100755 --- a/examples/csmsc/vits/local/synthesize_e2e.sh +++ b/examples/csmsc/vits/local/synthesize_e2e.sh @@ -3,9 +3,12 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 +add_blank=$4 + stage=0 stop_stage=0 + if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ @@ -14,5 +17,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --phones_dict=dump/phone_id_map.txt \ --output_dir=${train_output_path}/test_e2e \ - --text=${BIN_DIR}/../sentences.txt + --text=${BIN_DIR}/../sentences.txt \ + --add-blank=${add_blank} fi diff --git a/examples/csmsc/vits/run.sh b/examples/csmsc/vits/run.sh index 80e56e7c146ae02b6b7b9ff23159cb746a8088b6..c284b7b238cfc528277909f297d7cbb10a273299 100755 --- a/examples/csmsc/vits/run.sh +++ b/examples/csmsc/vits/run.sh @@ -10,6 +10,7 @@ stop_stage=100 conf_path=conf/default.yaml train_output_path=exp/default ckpt_name=snapshot_iter_153.pdz +add_blank=true # with the following command, you can choose the stage range you want to run # such as `./run.sh --stage 0 --stop-stage 0` @@ -18,7 +19,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data - ./local/preprocess.sh ${conf_path} || exit -1 + ./local/preprocess.sh ${conf_path} ${add_blank}|| exit -1 fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then @@ -32,5 +33,5 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # synthesize_e2e, vocoder is pwgan - 
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} ${add_blank}|| exit -1 fi diff --git a/examples/ljspeech/voc0/local/synthesize.sh b/examples/ljspeech/voc0/local/synthesize.sh index 1d5e11836aa3647cc9d93d86c25403cfb37d5a39..11874e4991ba7cd45a6ac356086a31707b4109f3 100755 --- a/examples/ljspeech/voc0/local/synthesize.sh +++ b/examples/ljspeech/voc0/local/synthesize.sh @@ -8,5 +8,4 @@ python ${BIN_DIR}/synthesize.py \ --input=${input_mel_path} \ --output=${train_output_path}/wavs/ \ --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \ - --ngpu=1 \ - --verbose \ No newline at end of file + --ngpu=1 \ No newline at end of file diff --git a/paddlespeech/t2s/exps/fastspeech2/normalize.py b/paddlespeech/t2s/exps/fastspeech2/normalize.py index 8ec20ebf0f8f1865c45cdeed99d487e079e498b0..92d10832b731856e885b11ebf038dd91da42bd9c 100644 --- a/paddlespeech/t2s/exps/fastspeech2/normalize.py +++ b/paddlespeech/t2s/exps/fastspeech2/normalize.py @@ -58,30 +58,8 @@ def main(): "--phones-dict", type=str, default=None, help="phone vocabulary file.") parser.add_argument( "--speaker-dict", type=str, default=None, help="speaker id map file.") - parser.add_argument( - "--verbose", - type=int, - default=1, - help="logging level. higher is more logging. 
(default=1)") - args = parser.parse_args() - # set logger - if args.verbose > 1: - logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - elif args.verbose > 0: - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - else: - logging.basicConfig( - level=logging.WARN, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - logging.warning('Skip DEBUG/INFO messages') + args = parser.parse_args() dumpdir = Path(args.dumpdir).expanduser() # use absolute path diff --git a/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/paddlespeech/t2s/exps/fastspeech2/preprocess.py index eac75f9821dd69b798a097687a1101b8d717dc9c..0045c5a3319f2eeba7956b57d64d64cb6569d181 100644 --- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py +++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py @@ -209,11 +209,6 @@ def main(): parser.add_argument("--config", type=str, help="fastspeech2 config file.") - parser.add_argument( - "--verbose", - type=int, - default=1, - help="logging level. higher is more logging. (default=1)") parser.add_argument( "--num-cpu", type=int, default=1, help="number of process.") @@ -248,10 +243,6 @@ def main(): with open(args.config, 'rt') as f: config = CfgNode(yaml.safe_load(f)) - if args.verbose > 1: - print(vars(args)) - print(config) - sentences, speaker_set = get_phn_dur(dur_file) merge_silence(sentences) diff --git a/paddlespeech/t2s/exps/gan_vocoder/normalize.py b/paddlespeech/t2s/exps/gan_vocoder/normalize.py index ba95d3ed61e341ebc458846a1f79099066c2cc7a..4cb7e41c576935e6d349eba7efe19914babded4d 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/normalize.py +++ b/paddlespeech/t2s/exps/gan_vocoder/normalize.py @@ -47,30 +47,8 @@ def main(): default=False, action="store_true", help="whether to skip the copy of wav files.") - parser.add_argument( - "--verbose", - type=int, - default=1, - help="logging level. 
higher is more logging. (default=1)") - args = parser.parse_args() - # set logger - if args.verbose > 1: - logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - elif args.verbose > 0: - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - else: - logging.basicConfig( - level=logging.WARN, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - logging.warning('Skip DEBUG/INFO messages') + args = parser.parse_args() dumpdir = Path(args.dumpdir).expanduser() # use absolute path diff --git a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py index 546367964f98205318b1ad089604d2518472506e..05c6576829cff710c50235015a07c66781d381e4 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py +++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py @@ -167,11 +167,6 @@ def main(): required=True, help="directory to dump feature files.") parser.add_argument("--config", type=str, help="vocoder config file.") - parser.add_argument( - "--verbose", - type=int, - default=1, - help="logging level. higher is more logging. 
(default=1)") parser.add_argument( "--num-cpu", type=int, default=1, help="number of process.") parser.add_argument( @@ -197,10 +192,6 @@ def main(): with open(args.config, 'rt') as f: config = CfgNode(yaml.safe_load(f)) - if args.verbose > 1: - print(vars(args)) - print(config) - sentences, speaker_set = get_phn_dur(dur_file) merge_silence(sentences) diff --git a/paddlespeech/t2s/exps/speedyspeech/normalize.py b/paddlespeech/t2s/exps/speedyspeech/normalize.py index 249a4d6d83e59c933994a1532d0e836a0a8679c3..f29466f655ee7033654e8366095831cff0a18657 100644 --- a/paddlespeech/t2s/exps/speedyspeech/normalize.py +++ b/paddlespeech/t2s/exps/speedyspeech/normalize.py @@ -50,11 +50,6 @@ def main(): "--tones-dict", type=str, default=None, help="tone vocabulary file.") parser.add_argument( "--speaker-dict", type=str, default=None, help="speaker id map file.") - parser.add_argument( - "--verbose", - type=int, - default=1, - help="logging level. higher is more logging. (default=1)") parser.add_argument( "--use-relative-path", @@ -63,24 +58,6 @@ def main(): help="whether use relative path in metadata") args = parser.parse_args() - # set logger - if args.verbose > 1: - logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - elif args.verbose > 0: - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - else: - logging.basicConfig( - level=logging.WARN, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - logging.warning('Skip DEBUG/INFO messages') - dumpdir = Path(args.dumpdir).expanduser() # use absolute path dumpdir = dumpdir.resolve() diff --git a/paddlespeech/t2s/exps/speedyspeech/preprocess.py b/paddlespeech/t2s/exps/speedyspeech/preprocess.py index aa7608d6b945b7fda3bdfab9ab74c1c080b20537..e4084c142f6cf00791930c6de1a11078d32a26c0 100644 --- a/paddlespeech/t2s/exps/speedyspeech/preprocess.py +++ 
b/paddlespeech/t2s/exps/speedyspeech/preprocess.py @@ -195,11 +195,6 @@ def main(): parser.add_argument("--config", type=str, help="fastspeech2 config file.") - parser.add_argument( - "--verbose", - type=int, - default=1, - help="logging level. higher is more logging. (default=1)") parser.add_argument( "--num-cpu", type=int, default=1, help="number of process.") @@ -230,10 +225,6 @@ def main(): with open(args.config, 'rt') as f: config = CfgNode(yaml.safe_load(f)) - if args.verbose > 1: - print(vars(args)) - print(config) - sentences, speaker_set = get_phn_dur(dur_file) merge_silence(sentences) diff --git a/paddlespeech/t2s/exps/tacotron2/preprocess.py b/paddlespeech/t2s/exps/tacotron2/preprocess.py index 6137da7f175b4e23af7a4b2e60908527bb65978d..c27b9769b7daa0de8857a27129e4f52dafee717f 100644 --- a/paddlespeech/t2s/exps/tacotron2/preprocess.py +++ b/paddlespeech/t2s/exps/tacotron2/preprocess.py @@ -184,11 +184,6 @@ def main(): parser.add_argument("--config", type=str, help="fastspeech2 config file.") - parser.add_argument( - "--verbose", - type=int, - default=1, - help="logging level. higher is more logging. 
(default=1)") parser.add_argument( "--num-cpu", type=int, default=1, help="number of process.") @@ -223,10 +218,6 @@ def main(): with open(args.config, 'rt') as f: config = CfgNode(yaml.safe_load(f)) - if args.verbose > 1: - print(vars(args)) - print(config) - sentences, speaker_set = get_phn_dur(dur_file) merge_silence(sentences) diff --git a/paddlespeech/t2s/exps/transformer_tts/normalize.py b/paddlespeech/t2s/exps/transformer_tts/normalize.py index 87e975b88ffb1b27c63885dfbe7fdb3c4cf5b718..e5f052c60dbbb84da35731a0eefb4a0c721b06be 100644 --- a/paddlespeech/t2s/exps/transformer_tts/normalize.py +++ b/paddlespeech/t2s/exps/transformer_tts/normalize.py @@ -51,30 +51,8 @@ def main(): "--phones-dict", type=str, default=None, help="phone vocabulary file.") parser.add_argument( "--speaker-dict", type=str, default=None, help="speaker id map file.") - parser.add_argument( - "--verbose", - type=int, - default=1, - help="logging level. higher is more logging. (default=1)") - args = parser.parse_args() - # set logger - if args.verbose > 1: - logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - elif args.verbose > 0: - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - else: - logging.basicConfig( - level=logging.WARN, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - logging.warning('Skip DEBUG/INFO messages') + args = parser.parse_args() # check directory existence dumpdir = Path(args.dumpdir).resolve() diff --git a/paddlespeech/t2s/exps/transformer_tts/preprocess.py b/paddlespeech/t2s/exps/transformer_tts/preprocess.py index 28ca3de6eb455a3832029eb44e55e804b0fd8826..2ebd5ecc2fdbc0ebd69203b779b71809e9fad8c9 100644 --- a/paddlespeech/t2s/exps/transformer_tts/preprocess.py +++ b/paddlespeech/t2s/exps/transformer_tts/preprocess.py @@ -186,11 +186,6 @@ def main(): type=str, help="yaml format configuration 
# Pinyin initials plus the special silence/pause symbols. A phone that is NOT
# listed here is treated as a final, i.e. the last phone of a character.
INITIALS = [
    'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh',
    'r', 'z', 'c', 's', 'j', 'q', 'x'
]
INITIALS += ['y', 'w', 'sp', 'spl', 'spn', 'sil']

# Hashed copies for O(1) membership tests inside the per-phone loop.
# INITIALS itself stays a list so existing importers see the same object type.
_INITIALS_SET = frozenset(INITIALS)
_TONE_MARKS = frozenset("12345")


def intersperse(lst, item):
    """Return a new list with ``item`` before, between and after all elements.

    Args:
        lst: Iterable of elements (materialized once, so generators work).
        item: The separator to insert.

    Returns:
        A list of length ``2 * len(lst) + 1``.

    Example:
        >>> intersperse(["n", "i3"], "")
        ['', 'n', '', 'i3', '']
    """
    items = list(lst)
    result = [item] * (len(items) * 2 + 1)
    # Odd positions hold the original elements, even positions the separator.
    result[1::2] = items
    return result


def insert_after_character(lst, item):
    """Return a new list with ``item`` first and after every final.

    A phone that is not in ``INITIALS`` is treated as a final and must end in
    a tone digit ``1``-``5``; ``item`` is appended right after it.

    Args:
        lst: Iterable of phone strings.
        item: The blank token to insert.

    Returns:
        A new list starting with ``item``.

    Raises:
        ValueError: If a phone that is not an initial does not end in a tone
            digit (this includes the empty string).

    Example:
        >>> insert_after_character(["n", "i3", "h", "ao3"], "")
        ['', 'n', 'i3', '', 'h', 'ao3', '']
    """
    result = [item]
    for phone in lst:
        result.append(phone)
        if phone in _INITIALS_SET:
            continue
        # Finals carry tones. An explicit raise (rather than ``assert``) keeps
        # the check active when Python runs with -O.
        if phone[-1:] not in _TONE_MARKS:
            raise ValueError(
                f"phone {phone!r} is not an initial and has no tone digit")
        result.append(item)
    return result


def add_blank(phones: List[str],
              filed: str="character",
              blank_token: str=""):
    """Insert ``blank_token`` into a phone sequence.

    Args:
        phones: Phone sequence to process; it is not modified.
        filed: Granularity of the insertion (the parameter name is kept as-is
            for compatibility with existing keyword callers).
            ``"phone"`` puts a blank around every phone::

                ["n", "i3", "h", "ao3"]
                -> ["", "n", "", "i3", "", "h", "", "ao3", ""]

            ``"character"`` puts a blank at the start and after every final::

                ["n", "i3", "h", "ao3"]
                -> ["", "n", "i3", "", "h", "ao3", ""]
        blank_token: The token to insert.

    Returns:
        A new list of phones with blanks inserted.

    Raises:
        ValueError: If ``filed`` is neither ``"phone"`` nor ``"character"``,
            or if a final without a tone digit is met in ``"character"`` mode.
    """
    if filed == "phone":
        return intersperse(phones, blank_token)
    if filed == "character":
        return insert_after_character(phones, blank_token)
    # Previously an unknown value silently returned the input unchanged,
    # hiding typos such as "phones" or "char".
    raise ValueError(
        f"filed must be 'phone' or 'character', got {filed!r}")
parser.add_argument("--config", type=str, help="fastspeech2 config file.") - parser.add_argument( - "--verbose", - type=int, - default=1, - help="logging level. higher is more logging. (default=1)") parser.add_argument( "--num-cpu", type=int, default=1, help="number of process.") @@ -236,10 +231,6 @@ def main(): with open(args.config, 'rt') as f: config = CfgNode(yaml.safe_load(f)) - if args.verbose > 1: - print(vars(args)) - print(config) - sentences, speaker_set = get_phn_dur(dur_file) merge_silence(sentences) diff --git a/paddlespeech/t2s/exps/vits/synthesize_e2e.py b/paddlespeech/t2s/exps/vits/synthesize_e2e.py index c82e5c03941288eee19d1b8f063105288daa8a12..33a4137519ace905733db815955f834a199e17cf 100644 --- a/paddlespeech/t2s/exps/vits/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/vits/synthesize_e2e.py @@ -23,6 +23,7 @@ from yacs.config import CfgNode from paddlespeech.t2s.exps.syn_utils import get_frontend from paddlespeech.t2s.exps.syn_utils import get_sentences from paddlespeech.t2s.models.vits import VITS +from paddlespeech.t2s.utils import str2bool def evaluate(args): @@ -55,6 +56,7 @@ def evaluate(args): output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) merge_sentences = False + add_blank = args.add_blank N = 0 T = 0 @@ -62,7 +64,9 @@ def evaluate(args): with timer() as t: if args.lang == 'zh': input_ids = frontend.get_input_ids( - sentence, merge_sentences=merge_sentences) + sentence, + merge_sentences=merge_sentences, + add_blank=add_blank) phone_ids = input_ids["phone_ids"] elif args.lang == 'en': input_ids = frontend.get_input_ids( @@ -125,6 +129,12 @@ def parse_args(): help="text to synthesize, a 'utt_id sentence' pair per line.") parser.add_argument("--output_dir", type=str, help="output dir.") + parser.add_argument( + "--add-blank", + type=str2bool, + default=True, + help="whether to add blank between phones") + args = parser.parse_args() return args diff --git a/paddlespeech/t2s/exps/vits/train.py 
b/paddlespeech/t2s/exps/vits/train.py index dbda8b7177bca068ecaeabe41679a93e153aba35..1a68d13269ab1ed54a176106f3f31edd8fab3e97 100644 --- a/paddlespeech/t2s/exps/vits/train.py +++ b/paddlespeech/t2s/exps/vits/train.py @@ -211,13 +211,18 @@ def train_sp(args, config): generator_first=config.generator_first, output_dir=output_dir) - trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir) + trainer = Trainer( + updater, + stop_trigger=(config.train_max_steps, "iteration"), + out=output_dir) if dist.get_rank() == 0: - trainer.extend(evaluator, trigger=(1, "epoch")) - trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) + trainer.extend( + evaluator, trigger=(config.eval_interval_steps, 'iteration')) + trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration')) trainer.extend( - Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) + Snapshot(max_size=config.num_snapshots), + trigger=(config.save_interval_steps, 'iteration')) print("Trainer Done!") trainer.run() diff --git a/paddlespeech/t2s/exps/waveflow/preprocess.py b/paddlespeech/t2s/exps/waveflow/preprocess.py index ef3a29175896d7d02f7a9df4dcc930d33f9476af..c7034aeabf8987441749956a309771f67040dfc9 100644 --- a/paddlespeech/t2s/exps/waveflow/preprocess.py +++ b/paddlespeech/t2s/exps/waveflow/preprocess.py @@ -143,8 +143,6 @@ if __name__ == "__main__": nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" ) - parser.add_argument( - "-v", "--verbose", action="store_true", help="print msg") config = get_cfg_defaults() args = parser.parse_args() @@ -153,8 +151,5 @@ if __name__ == "__main__": if args.opts: config.merge_from_list(args.opts) config.freeze() - if args.verbose: - print(config.data) - print(args) create_dataset(config.data, args.input, args.output) diff --git a/paddlespeech/t2s/exps/waveflow/synthesize.py b/paddlespeech/t2s/exps/waveflow/synthesize.py index 
# Pinyin initials plus the special silence/pause symbols. A phone that is NOT
# listed here is treated as the last phone of a character.
INITIALS = [
    'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh',
    'r', 'z', 'c', 's', 'j', 'q', 'x'
]
INITIALS += ['y', 'w', 'sp', 'spl', 'spn', 'sil']

# Hashed copy for O(1) membership tests inside the per-phone loop.
# INITIALS itself stays a list so existing importers see the same object type.
_INITIALS_SET = frozenset(INITIALS)


def intersperse(lst, item):
    """Return a new list with ``item`` before, between and after all elements.

    Args:
        lst: Iterable of elements (materialized once, so generators work).
        item: The separator to insert.

    Returns:
        A list of length ``2 * len(lst) + 1``.

    Example:
        >>> intersperse(["n", "i3"], "")
        ['', 'n', '', 'i3', '']
    """
    items = list(lst)
    result = [item] * (len(items) * 2 + 1)
    # Odd positions hold the original elements, even positions the separator.
    result[1::2] = items
    return result


def insert_after_character(lst, item):
    """Return a new list with ``item`` first and after every non-initial phone.

    Unlike a strict training-time check, no tone validation is done here: any
    phone that is not in ``INITIALS`` is simply followed by ``item``.

    Args:
        lst: Iterable of phone strings.
        item: The blank token to insert.

    Returns:
        A new list starting with ``item``.

    Example:
        >>> insert_after_character(["n", "i3", "h", "ao3"], "")
        ['', 'n', 'i3', '', 'h', 'ao3', '']
    """
    result = [item]
    for phone in lst:
        result.append(phone)
        if phone not in _INITIALS_SET:
            result.append(item)
    return result
add_blank: bool=False, + blank_token: str="") -> Dict[str, List[paddle.Tensor]]: phonemes = self.get_phonemes( sentence, merge_sentences=merge_sentences, @@ -299,6 +325,10 @@ class Frontend(): for part_phonemes in phonemes: phones, tones = self._get_phone_tone( part_phonemes, get_tone_ids=get_tone_ids) + + if add_blank: + phones = insert_after_character(phones, blank_token) + if tones: tone_ids = self._t2id(tones) tone_ids = paddle.to_tensor(tone_ids) diff --git a/paddlespeech/t2s/models/vits/vits.py b/paddlespeech/t2s/models/vits/vits.py index ab8eda26d0b9b2118fa2b06b6f9ea546abb74873..5c476be77d747dc2fb2a6879640fed1258c6bcc8 100644 --- a/paddlespeech/t2s/models/vits/vits.py +++ b/paddlespeech/t2s/models/vits/vits.py @@ -227,11 +227,7 @@ class VITS(nn.Layer): lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). forward_generator (bool): Whether to forward generator. Returns: - Dict[str, Any]: - - loss (Tensor): Loss scalar tensor. - - stats (Dict[str, float]): Statistics to be monitored. - - weight (Tensor): Weight tensor to summarize losses. - - optim_idx (int): Optimizer index (0 for G and 1 for D). + """ if forward_generator: return self._forward_generator(