From 1731976e4ebc1e73f9a4cca2f20643bb8897b591 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 16 Jun 2022 07:15:55 +0000 Subject: [PATCH] add blank between characters for vits, test=tts --- examples/csmsc/vits/conf/default.yaml | 8 +- examples/csmsc/vits/local/preprocess.sh | 4 + examples/csmsc/vits/local/synthesize_e2e.sh | 6 +- examples/csmsc/vits/run.sh | 5 +- examples/ljspeech/voc0/local/synthesize.sh | 3 +- .../t2s/exps/fastspeech2/normalize.py | 24 +----- .../t2s/exps/fastspeech2/preprocess.py | 9 -- .../t2s/exps/gan_vocoder/normalize.py | 24 +----- .../t2s/exps/gan_vocoder/preprocess.py | 9 -- .../t2s/exps/speedyspeech/normalize.py | 23 ------ .../t2s/exps/speedyspeech/preprocess.py | 9 -- paddlespeech/t2s/exps/tacotron2/preprocess.py | 9 -- .../t2s/exps/transformer_tts/normalize.py | 24 +----- .../t2s/exps/transformer_tts/preprocess.py | 9 -- paddlespeech/t2s/exps/vits/normalize.py | 82 +++++++++++++------ paddlespeech/t2s/exps/vits/preprocess.py | 9 -- paddlespeech/t2s/exps/vits/synthesize_e2e.py | 12 ++- paddlespeech/t2s/exps/vits/train.py | 13 ++- paddlespeech/t2s/exps/waveflow/preprocess.py | 5 -- paddlespeech/t2s/exps/waveflow/synthesize.py | 2 - paddlespeech/t2s/frontend/zh_frontend.py | 42 ++++++++-- paddlespeech/t2s/models/vits/vits.py | 6 +- 22 files changed, 136 insertions(+), 201 deletions(-) diff --git a/examples/csmsc/vits/conf/default.yaml b/examples/csmsc/vits/conf/default.yaml index 47af780d..32f995cc 100644 --- a/examples/csmsc/vits/conf/default.yaml +++ b/examples/csmsc/vits/conf/default.yaml @@ -178,6 +178,8 @@ generator_first: False # whether to start updating generator first ########################################################## # OTHER TRAINING SETTING # ########################################################## -max_epoch: 1000 # number of epochs -num_snapshots: 10 # max number of snapshots to keep while training -seed: 777 # random seed number +num_snapshots: 10 # max number of snapshots to keep while training +train_max_steps: 250000 # Number of training steps. == total_iters / ngpus, total_iters = 1000000 +save_interval_steps: 1000 # Interval steps to save checkpoint. +eval_interval_steps: 250 # Interval steps to evaluate the network. +seed: 777 # random seed number diff --git a/examples/csmsc/vits/local/preprocess.sh b/examples/csmsc/vits/local/preprocess.sh index 1d3ae593..1cd6d1f9 100755 --- a/examples/csmsc/vits/local/preprocess.sh +++ b/examples/csmsc/vits/local/preprocess.sh @@ -4,6 +4,7 @@ stage=0 stop_stage=100 config_path=$1 +add_blank=$2 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # get durations from MFA's result @@ -44,6 +45,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --feats-stats=dump/train/feats_stats.npy \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt \ + --add-blank=${add_blank} \ --skip-wav-copy python3 ${BIN_DIR}/normalize.py \ @@ -52,6 +54,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --feats-stats=dump/train/feats_stats.npy \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt \ + --add-blank=${add_blank} \ --skip-wav-copy python3 ${BIN_DIR}/normalize.py \ @@ -60,5 +63,6 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --feats-stats=dump/train/feats_stats.npy \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt \ + --add-blank=${add_blank} \ --skip-wav-copy fi diff --git a/examples/csmsc/vits/local/synthesize_e2e.sh b/examples/csmsc/vits/local/synthesize_e2e.sh index edbb07bf..3f3bf651 100755 --- a/examples/csmsc/vits/local/synthesize_e2e.sh +++ b/examples/csmsc/vits/local/synthesize_e2e.sh @@ -3,9 +3,12 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 +add_blank=$4 + stage=0 stop_stage=0 + if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ @@ -14,5 +17,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --phones_dict=dump/phone_id_map.txt \ --output_dir=${train_output_path}/test_e2e \ - --text=${BIN_DIR}/../sentences.txt + --text=${BIN_DIR}/../sentences.txt \ + --add-blank=${add_blank} fi diff --git a/examples/csmsc/vits/run.sh b/examples/csmsc/vits/run.sh index 80e56e7c..c284b7b2 100755 --- a/examples/csmsc/vits/run.sh +++ b/examples/csmsc/vits/run.sh @@ -10,6 +10,7 @@ stop_stage=100 conf_path=conf/default.yaml train_output_path=exp/default ckpt_name=snapshot_iter_153.pdz +add_blank=true # with the following command, you can choose the stage range you want to run # such as `./run.sh --stage 0 --stop-stage 0` @@ -18,7 +19,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data - ./local/preprocess.sh ${conf_path} || exit -1 + ./local/preprocess.sh ${conf_path} ${add_blank}|| exit -1 fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then @@ -32,5 +33,5 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # synthesize_e2e, vocoder is pwgan - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} ${add_blank}|| exit -1 fi diff --git a/examples/ljspeech/voc0/local/synthesize.sh b/examples/ljspeech/voc0/local/synthesize.sh index 1d5e1183..11874e49 100755 --- a/examples/ljspeech/voc0/local/synthesize.sh +++ b/examples/ljspeech/voc0/local/synthesize.sh @@ -8,5 +8,4 @@ python ${BIN_DIR}/synthesize.py \ --input=${input_mel_path} \ --output=${train_output_path}/wavs/ \ --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \ - --ngpu=1 \ - --verbose \ No newline at end of file + --ngpu=1 \ No newline at end of file diff --git a/paddlespeech/t2s/exps/fastspeech2/normalize.py b/paddlespeech/t2s/exps/fastspeech2/normalize.py index 8ec20ebf..92d10832 100644 --- a/paddlespeech/t2s/exps/fastspeech2/normalize.py +++ b/paddlespeech/t2s/exps/fastspeech2/normalize.py @@ -58,30 +58,8 @@ def main(): "--phones-dict", type=str, default=None, help="phone vocabulary file.") parser.add_argument( "--speaker-dict", type=str, default=None, help="speaker id map file.") - parser.add_argument( - "--verbose", - type=int, - default=1, - help="logging level. higher is more logging. (default=1)") - args = parser.parse_args() - # set logger - if args.verbose > 1: - logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - elif args.verbose > 0: - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - else: - logging.basicConfig( - level=logging.WARN, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - logging.warning('Skip DEBUG/INFO messages') + args = parser.parse_args() dumpdir = Path(args.dumpdir).expanduser() # use absolute path diff --git a/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/paddlespeech/t2s/exps/fastspeech2/preprocess.py index eac75f98..0045c5a3 100644 --- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py +++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py @@ -209,11 +209,6 @@ def main(): parser.add_argument("--config", type=str, help="fastspeech2 config file.") - parser.add_argument( - "--verbose", - type=int, - default=1, - help="logging level. higher is more logging. (default=1)") parser.add_argument( "--num-cpu", type=int, default=1, help="number of process.") @@ -248,10 +243,6 @@ def main(): with open(args.config, 'rt') as f: config = CfgNode(yaml.safe_load(f)) - if args.verbose > 1: - print(vars(args)) - print(config) - sentences, speaker_set = get_phn_dur(dur_file) merge_silence(sentences) diff --git a/paddlespeech/t2s/exps/gan_vocoder/normalize.py b/paddlespeech/t2s/exps/gan_vocoder/normalize.py index ba95d3ed..4cb7e41c 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/normalize.py +++ b/paddlespeech/t2s/exps/gan_vocoder/normalize.py @@ -47,30 +47,8 @@ def main(): default=False, action="store_true", help="whether to skip the copy of wav files.") - parser.add_argument( - "--verbose", - type=int, - default=1, - help="logging level. higher is more logging. (default=1)") - args = parser.parse_args() - # set logger - if args.verbose > 1: - logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - elif args.verbose > 0: - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - else: - logging.basicConfig( - level=logging.WARN, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - logging.warning('Skip DEBUG/INFO messages') + args = parser.parse_args() dumpdir = Path(args.dumpdir).expanduser() # use absolute path diff --git a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py index 54636796..05c65768 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py +++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py @@ -167,11 +167,6 @@ def main(): required=True, help="directory to dump feature files.") parser.add_argument("--config", type=str, help="vocoder config file.") - parser.add_argument( - "--verbose", - type=int, - default=1, - help="logging level. higher is more logging. (default=1)") parser.add_argument( "--num-cpu", type=int, default=1, help="number of process.") parser.add_argument( @@ -197,10 +192,6 @@ def main(): with open(args.config, 'rt') as f: config = CfgNode(yaml.safe_load(f)) - if args.verbose > 1: - print(vars(args)) - print(config) - sentences, speaker_set = get_phn_dur(dur_file) merge_silence(sentences) diff --git a/paddlespeech/t2s/exps/speedyspeech/normalize.py b/paddlespeech/t2s/exps/speedyspeech/normalize.py index 249a4d6d..f29466f6 100644 --- a/paddlespeech/t2s/exps/speedyspeech/normalize.py +++ b/paddlespeech/t2s/exps/speedyspeech/normalize.py @@ -50,11 +50,6 @@ def main(): "--tones-dict", type=str, default=None, help="tone vocabulary file.") parser.add_argument( "--speaker-dict", type=str, default=None, help="speaker id map file.") - parser.add_argument( - "--verbose", - type=int, - default=1, - help="logging level. higher is more logging. (default=1)") parser.add_argument( "--use-relative-path", @@ -63,24 +58,6 @@ def main(): help="whether use relative path in metadata") args = parser.parse_args() - # set logger - if args.verbose > 1: - logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - elif args.verbose > 0: - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - else: - logging.basicConfig( - level=logging.WARN, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - logging.warning('Skip DEBUG/INFO messages') - dumpdir = Path(args.dumpdir).expanduser() # use absolute path dumpdir = dumpdir.resolve() diff --git a/paddlespeech/t2s/exps/speedyspeech/preprocess.py b/paddlespeech/t2s/exps/speedyspeech/preprocess.py index aa7608d6..e4084c14 100644 --- a/paddlespeech/t2s/exps/speedyspeech/preprocess.py +++ b/paddlespeech/t2s/exps/speedyspeech/preprocess.py @@ -195,11 +195,6 @@ def main(): parser.add_argument("--config", type=str, help="fastspeech2 config file.") - parser.add_argument( - "--verbose", - type=int, - default=1, - help="logging level. higher is more logging. (default=1)") parser.add_argument( "--num-cpu", type=int, default=1, help="number of process.") @@ -230,10 +225,6 @@ def main(): with open(args.config, 'rt') as f: config = CfgNode(yaml.safe_load(f)) - if args.verbose > 1: - print(vars(args)) - print(config) - sentences, speaker_set = get_phn_dur(dur_file) merge_silence(sentences) diff --git a/paddlespeech/t2s/exps/tacotron2/preprocess.py b/paddlespeech/t2s/exps/tacotron2/preprocess.py index 6137da7f..c27b9769 100644 --- a/paddlespeech/t2s/exps/tacotron2/preprocess.py +++ b/paddlespeech/t2s/exps/tacotron2/preprocess.py @@ -184,11 +184,6 @@ def main(): parser.add_argument("--config", type=str, help="fastspeech2 config file.") - parser.add_argument( - "--verbose", - type=int, - default=1, - help="logging level. higher is more logging. (default=1)") parser.add_argument( "--num-cpu", type=int, default=1, help="number of process.") @@ -223,10 +218,6 @@ def main(): with open(args.config, 'rt') as f: config = CfgNode(yaml.safe_load(f)) - if args.verbose > 1: - print(vars(args)) - print(config) - sentences, speaker_set = get_phn_dur(dur_file) merge_silence(sentences) diff --git a/paddlespeech/t2s/exps/transformer_tts/normalize.py b/paddlespeech/t2s/exps/transformer_tts/normalize.py index 87e975b8..e5f052c6 100644 --- a/paddlespeech/t2s/exps/transformer_tts/normalize.py +++ b/paddlespeech/t2s/exps/transformer_tts/normalize.py @@ -51,30 +51,8 @@ def main(): "--phones-dict", type=str, default=None, help="phone vocabulary file.") parser.add_argument( "--speaker-dict", type=str, default=None, help="speaker id map file.") - parser.add_argument( - "--verbose", - type=int, - default=1, - help="logging level. higher is more logging. (default=1)") - args = parser.parse_args() - # set logger - if args.verbose > 1: - logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - elif args.verbose > 0: - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - else: - logging.basicConfig( - level=logging.WARN, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - logging.warning('Skip DEBUG/INFO messages') + args = parser.parse_args() # check directory existence dumpdir = Path(args.dumpdir).resolve() diff --git a/paddlespeech/t2s/exps/transformer_tts/preprocess.py b/paddlespeech/t2s/exps/transformer_tts/preprocess.py index 28ca3de6..2ebd5ecc 100644 --- a/paddlespeech/t2s/exps/transformer_tts/preprocess.py +++ b/paddlespeech/t2s/exps/transformer_tts/preprocess.py @@ -186,11 +186,6 @@ def main(): type=str, help="yaml format configuration file.") - parser.add_argument( - "--verbose", - type=int, - default=1, - help="logging level. higher is more logging. (default=1)") parser.add_argument( "--num-cpu", type=int, default=1, help="number of process.") @@ -210,10 +205,6 @@ def main(): _C = Configuration(_C) config = _C.clone() - if args.verbose > 1: - print(vars(args)) - print(config) - phone_id_map_path = dumpdir / "phone_id_map.txt" speaker_id_map_path = dumpdir / "speaker_id_map.txt" diff --git a/paddlespeech/t2s/exps/vits/normalize.py b/paddlespeech/t2s/exps/vits/normalize.py index 6fc8adb0..5881ae95 100644 --- a/paddlespeech/t2s/exps/vits/normalize.py +++ b/paddlespeech/t2s/exps/vits/normalize.py @@ -16,6 +16,7 @@ import argparse import logging from operator import itemgetter from pathlib import Path +from typing import List import jsonlines import numpy as np @@ -23,6 +24,50 @@ from sklearn.preprocessing import StandardScaler from tqdm import tqdm from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.utils import str2bool + +INITIALS = [ + 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh', + 'r', 'z', 'c', 's', 'j', 'q', 'x' +] +INITIALS += ['y', 'w', 'sp', 'spl', 'spn', 'sil'] + + +def intersperse(lst, item): + result = [item] * (len(lst) * 2 + 1) + result[1::2] = lst + return result + + +def insert_after_character(lst, item): + result = [item] + for phone in lst: + result.append(phone) + if phone not in INITIALS: + # finals has tones + assert phone[-1] in "12345" + result.append(item) + return result + + +def add_blank(phones: List[str], + filed: str="character", + blank_token: str=""): + if filed == "phone": + """ + add blank after phones + input: ["n", "i3", "h", "ao3", "m", "a5"] + output: ["n", "", "i3", "", "h", "", "ao3", "", "m", "", "a5"] + """ + phones = intersperse(phones, blank_token) + elif filed == "character": + """ + add blank after characters + input: ["n", "i3", "h", "ao3"] + output: ["n", "i3", "", "h", "ao3", "", "m", "a5"] + """ + phones = insert_after_character(phones, blank_token) + return phones def main(): @@ -58,29 +103,12 @@ def main(): parser.add_argument( "--speaker-dict", type=str, default=None, help="speaker id map file.") parser.add_argument( - "--verbose", - type=int, - default=1, - help="logging level. higher is more logging. (default=1)") - args = parser.parse_args() + "--add-blank", + type=str2bool, + default=True, + help="whether to add blank between phones") - # set logger - if args.verbose > 1: - logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - elif args.verbose > 0: - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - else: - logging.basicConfig( - level=logging.WARN, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - ) - logging.warning('Skip DEBUG/INFO messages') + args = parser.parse_args() dumpdir = Path(args.dumpdir).expanduser() # use absolute path @@ -135,13 +163,19 @@ def main(): else: wav_path = wave - phone_ids = [vocab_phones[p] for p in item['phones']] + phones = item['phones'] + text_lengths = item['text_lengths'] + if args.add_blank: + phones = add_blank(phones, filed="character") + text_lengths = len(phones) + + phone_ids = [vocab_phones[p] for p in phones] spk_id = vocab_speaker[item["speaker"]] record = { "utt_id": item['utt_id'], "text": phone_ids, - "text_lengths": item['text_lengths'], + "text_lengths": text_lengths, 'feats': str(feats_path), "feats_lengths": item['feats_lengths'], "wave": str(wav_path), diff --git a/paddlespeech/t2s/exps/vits/preprocess.py b/paddlespeech/t2s/exps/vits/preprocess.py index 6aa139fb..f89ab356 100644 --- a/paddlespeech/t2s/exps/vits/preprocess.py +++ b/paddlespeech/t2s/exps/vits/preprocess.py @@ -197,11 +197,6 @@ def main(): parser.add_argument("--config", type=str, help="fastspeech2 config file.") - parser.add_argument( - "--verbose", - type=int, - default=1, - help="logging level. higher is more logging. (default=1)") parser.add_argument( "--num-cpu", type=int, default=1, help="number of process.") @@ -236,10 +231,6 @@ def main(): with open(args.config, 'rt') as f: config = CfgNode(yaml.safe_load(f)) - if args.verbose > 1: - print(vars(args)) - print(config) - sentences, speaker_set = get_phn_dur(dur_file) merge_silence(sentences) diff --git a/paddlespeech/t2s/exps/vits/synthesize_e2e.py b/paddlespeech/t2s/exps/vits/synthesize_e2e.py index c82e5c03..33a41375 100644 --- a/paddlespeech/t2s/exps/vits/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/vits/synthesize_e2e.py @@ -23,6 +23,7 @@ from yacs.config import CfgNode from paddlespeech.t2s.exps.syn_utils import get_frontend from paddlespeech.t2s.exps.syn_utils import get_sentences from paddlespeech.t2s.models.vits import VITS +from paddlespeech.t2s.utils import str2bool def evaluate(args): @@ -55,6 +56,7 @@ def evaluate(args): output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) merge_sentences = False + add_blank = args.add_blank N = 0 T = 0 @@ -62,7 +64,9 @@ def evaluate(args): with timer() as t: if args.lang == 'zh': input_ids = frontend.get_input_ids( - sentence, merge_sentences=merge_sentences) + sentence, + merge_sentences=merge_sentences, + add_blank=add_blank) phone_ids = input_ids["phone_ids"] elif args.lang == 'en': input_ids = frontend.get_input_ids( @@ -125,6 +129,12 @@ def parse_args(): help="text to synthesize, a 'utt_id sentence' pair per line.") parser.add_argument("--output_dir", type=str, help="output dir.") + parser.add_argument( + "--add-blank", + type=str2bool, + default=True, + help="whether to add blank between phones") + args = parser.parse_args() return args diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py index dbda8b71..1a68d132 100644 --- a/paddlespeech/t2s/exps/vits/train.py +++ b/paddlespeech/t2s/exps/vits/train.py @@ -211,13 +211,18 @@ def train_sp(args, config): generator_first=config.generator_first, output_dir=output_dir) - trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir) + trainer = Trainer( + updater, + stop_trigger=(config.train_max_steps, "iteration"), + out=output_dir) if dist.get_rank() == 0: - trainer.extend(evaluator, trigger=(1, "epoch")) - trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) + trainer.extend( + evaluator, trigger=(config.eval_interval_steps, 'iteration')) + trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration')) trainer.extend( - Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) + Snapshot(max_size=config.num_snapshots), + trigger=(config.save_interval_steps, 'iteration')) print("Trainer Done!") trainer.run() diff --git a/paddlespeech/t2s/exps/waveflow/preprocess.py b/paddlespeech/t2s/exps/waveflow/preprocess.py index ef3a2917..c7034aea 100644 --- a/paddlespeech/t2s/exps/waveflow/preprocess.py +++ b/paddlespeech/t2s/exps/waveflow/preprocess.py @@ -143,8 +143,6 @@ if __name__ == "__main__": nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" ) - parser.add_argument( - "-v", "--verbose", action="store_true", help="print msg") config = get_cfg_defaults() args = parser.parse_args() @@ -153,8 +151,5 @@ if __name__ == "__main__": if args.opts: config.merge_from_list(args.opts) config.freeze() - if args.verbose: - print(config.data) - print(args) create_dataset(config.data, args.input, args.output) diff --git a/paddlespeech/t2s/exps/waveflow/synthesize.py b/paddlespeech/t2s/exps/waveflow/synthesize.py index 53715b01..a3190c6e 100644 --- a/paddlespeech/t2s/exps/waveflow/synthesize.py +++ b/paddlespeech/t2s/exps/waveflow/synthesize.py @@ -72,8 +72,6 @@ if __name__ == "__main__": nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" ) - parser.add_argument( - "-v", "--verbose", action="store_true", help="print msg") args = parser.parse_args() if args.config: diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index 129aa944..143ccbc1 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -29,6 +29,29 @@ from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer +INITIALS = [ + 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh', + 'r', 'z', 'c', 's', 'j', 'q', 'x' +] +INITIALS += ['y', 'w', 'sp', 'spl', 'spn', 'sil'] + + +def intersperse(lst, item): + result = [item] * (len(lst) * 2 + 1) + result[1::2] = lst + return result + + +def insert_after_character(lst, item): + result = [item] + for phone in lst: + result.append(phone) + if phone not in INITIALS: + # finals has tones + # assert phone[-1] in "12345" + result.append(item) + return result + class Frontend(): def __init__(self, @@ -280,12 +303,15 @@ class Frontend(): print("----------------------------") return phonemes - def get_input_ids(self, - sentence: str, - merge_sentences: bool=True, - get_tone_ids: bool=False, - robot: bool=False, - print_info: bool=False) -> Dict[str, List[paddle.Tensor]]: + def get_input_ids( + self, + sentence: str, + merge_sentences: bool=True, + get_tone_ids: bool=False, + robot: bool=False, + print_info: bool=False, + add_blank: bool=False, + blank_token: str="") -> Dict[str, List[paddle.Tensor]]: phonemes = self.get_phonemes( sentence, merge_sentences=merge_sentences, @@ -299,6 +325,10 @@ class Frontend(): for part_phonemes in phonemes: phones, tones = self._get_phone_tone( part_phonemes, get_tone_ids=get_tone_ids) + + if add_blank: + phones = insert_after_character(phones, blank_token) + if tones: tone_ids = self._t2id(tones) tone_ids = paddle.to_tensor(tone_ids) diff --git a/paddlespeech/t2s/models/vits/vits.py b/paddlespeech/t2s/models/vits/vits.py index ab8eda26..5c476be7 100644 --- a/paddlespeech/t2s/models/vits/vits.py +++ b/paddlespeech/t2s/models/vits/vits.py @@ -227,11 +227,7 @@ class VITS(nn.Layer): lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). forward_generator (bool): Whether to forward generator. Returns: - Dict[str, Any]: - - loss (Tensor): Loss scalar tensor. - - stats (Dict[str, float]): Statistics to be monitored. - - weight (Tensor): Weight tensor to summarize losses. - - optim_idx (int): Optimizer index (0 for G and 1 for D). + """ if forward_generator: return self._forward_generator( -- GitLab