Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
1731976e
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
1731976e
编写于
6月 16, 2022
作者:
小湉湉
提交者:
root
6月 16, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add blank between characters for vits, test=tts
上级
7fc9f5d4
变更
22
显示空白变更内容
内联
并排
Showing
22 changed files
with
136 additions
and
201 deletions
+136
-201
examples/csmsc/vits/conf/default.yaml
examples/csmsc/vits/conf/default.yaml
+5
-3
examples/csmsc/vits/local/preprocess.sh
examples/csmsc/vits/local/preprocess.sh
+4
-0
examples/csmsc/vits/local/synthesize_e2e.sh
examples/csmsc/vits/local/synthesize_e2e.sh
+5
-1
examples/csmsc/vits/run.sh
examples/csmsc/vits/run.sh
+3
-2
examples/ljspeech/voc0/local/synthesize.sh
examples/ljspeech/voc0/local/synthesize.sh
+1
-2
paddlespeech/t2s/exps/fastspeech2/normalize.py
paddlespeech/t2s/exps/fastspeech2/normalize.py
+1
-23
paddlespeech/t2s/exps/fastspeech2/preprocess.py
paddlespeech/t2s/exps/fastspeech2/preprocess.py
+0
-9
paddlespeech/t2s/exps/gan_vocoder/normalize.py
paddlespeech/t2s/exps/gan_vocoder/normalize.py
+1
-23
paddlespeech/t2s/exps/gan_vocoder/preprocess.py
paddlespeech/t2s/exps/gan_vocoder/preprocess.py
+0
-9
paddlespeech/t2s/exps/speedyspeech/normalize.py
paddlespeech/t2s/exps/speedyspeech/normalize.py
+0
-23
paddlespeech/t2s/exps/speedyspeech/preprocess.py
paddlespeech/t2s/exps/speedyspeech/preprocess.py
+0
-9
paddlespeech/t2s/exps/tacotron2/preprocess.py
paddlespeech/t2s/exps/tacotron2/preprocess.py
+0
-9
paddlespeech/t2s/exps/transformer_tts/normalize.py
paddlespeech/t2s/exps/transformer_tts/normalize.py
+1
-23
paddlespeech/t2s/exps/transformer_tts/preprocess.py
paddlespeech/t2s/exps/transformer_tts/preprocess.py
+0
-9
paddlespeech/t2s/exps/vits/normalize.py
paddlespeech/t2s/exps/vits/normalize.py
+58
-24
paddlespeech/t2s/exps/vits/preprocess.py
paddlespeech/t2s/exps/vits/preprocess.py
+0
-9
paddlespeech/t2s/exps/vits/synthesize_e2e.py
paddlespeech/t2s/exps/vits/synthesize_e2e.py
+11
-1
paddlespeech/t2s/exps/vits/train.py
paddlespeech/t2s/exps/vits/train.py
+9
-4
paddlespeech/t2s/exps/waveflow/preprocess.py
paddlespeech/t2s/exps/waveflow/preprocess.py
+0
-5
paddlespeech/t2s/exps/waveflow/synthesize.py
paddlespeech/t2s/exps/waveflow/synthesize.py
+0
-2
paddlespeech/t2s/frontend/zh_frontend.py
paddlespeech/t2s/frontend/zh_frontend.py
+36
-6
paddlespeech/t2s/models/vits/vits.py
paddlespeech/t2s/models/vits/vits.py
+1
-5
未找到文件。
examples/csmsc/vits/conf/default.yaml
浏览文件 @
1731976e
...
...
@@ -178,6 +178,8 @@ generator_first: False # whether to start updating generator first
##########################################################
# OTHER TRAINING SETTING #
##########################################################
max_epoch
:
1000
# number of epochs
num_snapshots
:
10
# max number of snapshots to keep while training
train_max_steps
:
250000
# Number of training steps. == total_iters / ngpus, total_iters = 1000000
save_interval_steps
:
1000
# Interval steps to save checkpoint.
eval_interval_steps
:
250
# Interval steps to evaluate the network.
seed
:
777
# random seed number
examples/csmsc/vits/local/preprocess.sh
浏览文件 @
1731976e
...
...
@@ -4,6 +4,7 @@ stage=0
stop_stage
=
100
config_path
=
$1
add_blank
=
$2
if
[
${
stage
}
-le
0
]
&&
[
${
stop_stage
}
-ge
0
]
;
then
# get durations from MFA's result
...
...
@@ -44,6 +45,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--feats-stats
=
dump/train/feats_stats.npy
\
--phones-dict
=
dump/phone_id_map.txt
\
--speaker-dict
=
dump/speaker_id_map.txt
\
--add-blank
=
${
add_blank
}
\
--skip-wav-copy
python3
${
BIN_DIR
}
/normalize.py
\
...
...
@@ -52,6 +54,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--feats-stats
=
dump/train/feats_stats.npy
\
--phones-dict
=
dump/phone_id_map.txt
\
--speaker-dict
=
dump/speaker_id_map.txt
\
--add-blank
=
${
add_blank
}
\
--skip-wav-copy
python3
${
BIN_DIR
}
/normalize.py
\
...
...
@@ -60,5 +63,6 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--feats-stats
=
dump/train/feats_stats.npy
\
--phones-dict
=
dump/phone_id_map.txt
\
--speaker-dict
=
dump/speaker_id_map.txt
\
--add-blank
=
${
add_blank
}
\
--skip-wav-copy
fi
examples/csmsc/vits/local/synthesize_e2e.sh
浏览文件 @
1731976e
...
...
@@ -3,9 +3,12 @@
config_path
=
$1
train_output_path
=
$2
ckpt_name
=
$3
add_blank
=
$4
stage
=
0
stop_stage
=
0
if
[
${
stage
}
-le
0
]
&&
[
${
stop_stage
}
-ge
0
]
;
then
FLAGS_allocator_strategy
=
naive_best_fit
\
FLAGS_fraction_of_gpu_memory_to_use
=
0.01
\
...
...
@@ -14,5 +17,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--ckpt
=
${
train_output_path
}
/checkpoints/
${
ckpt_name
}
\
--phones_dict
=
dump/phone_id_map.txt
\
--output_dir
=
${
train_output_path
}
/test_e2e
\
--text
=
${
BIN_DIR
}
/../sentences.txt
--text
=
${
BIN_DIR
}
/../sentences.txt
\
--add-blank
=
${
add_blank
}
fi
examples/csmsc/vits/run.sh
浏览文件 @
1731976e
...
...
@@ -10,6 +10,7 @@ stop_stage=100
conf_path
=
conf/default.yaml
train_output_path
=
exp/default
ckpt_name
=
snapshot_iter_153.pdz
add_blank
=
true
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
...
...
@@ -18,7 +19,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if
[
${
stage
}
-le
0
]
&&
[
${
stop_stage
}
-ge
0
]
;
then
# prepare data
./local/preprocess.sh
${
conf_path
}
||
exit
-1
./local/preprocess.sh
${
conf_path
}
${
add_blank
}
||
exit
-1
fi
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
...
...
@@ -32,5 +33,5 @@ fi
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
# synthesize_e2e, vocoder is pwgan
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/synthesize_e2e.sh
${
conf_path
}
${
train_output_path
}
${
ckpt_name
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/synthesize_e2e.sh
${
conf_path
}
${
train_output_path
}
${
ckpt_name
}
${
add_blank
}
||
exit
-1
fi
examples/ljspeech/voc0/local/synthesize.sh
浏览文件 @
1731976e
...
...
@@ -8,5 +8,4 @@ python ${BIN_DIR}/synthesize.py \
--input
=
${
input_mel_path
}
\
--output
=
${
train_output_path
}
/wavs/
\
--checkpoint_path
=
${
train_output_path
}
/checkpoints/
${
ckpt_name
}
\
--ngpu
=
1
\
--verbose
\ No newline at end of file
--ngpu
=
1
\ No newline at end of file
paddlespeech/t2s/exps/fastspeech2/normalize.py
浏览文件 @
1731976e
...
...
@@ -58,30 +58,8 @@ def main():
"--phones-dict"
,
type
=
str
,
default
=
None
,
help
=
"phone vocabulary file."
)
parser
.
add_argument
(
"--speaker-dict"
,
type
=
str
,
default
=
None
,
help
=
"speaker id map file."
)
parser
.
add_argument
(
"--verbose"
,
type
=
int
,
default
=
1
,
help
=
"logging level. higher is more logging. (default=1)"
)
args
=
parser
.
parse_args
()
# set logger
if
args
.
verbose
>
1
:
logging
.
basicConfig
(
level
=
logging
.
DEBUG
,
format
=
"%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
elif
args
.
verbose
>
0
:
logging
.
basicConfig
(
level
=
logging
.
INFO
,
format
=
"%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
else
:
logging
.
basicConfig
(
level
=
logging
.
WARN
,
format
=
"%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
logging
.
warning
(
'Skip DEBUG/INFO messages'
)
args
=
parser
.
parse_args
()
dumpdir
=
Path
(
args
.
dumpdir
).
expanduser
()
# use absolute path
...
...
paddlespeech/t2s/exps/fastspeech2/preprocess.py
浏览文件 @
1731976e
...
...
@@ -209,11 +209,6 @@ def main():
parser
.
add_argument
(
"--config"
,
type
=
str
,
help
=
"fastspeech2 config file."
)
parser
.
add_argument
(
"--verbose"
,
type
=
int
,
default
=
1
,
help
=
"logging level. higher is more logging. (default=1)"
)
parser
.
add_argument
(
"--num-cpu"
,
type
=
int
,
default
=
1
,
help
=
"number of process."
)
...
...
@@ -248,10 +243,6 @@ def main():
with
open
(
args
.
config
,
'rt'
)
as
f
:
config
=
CfgNode
(
yaml
.
safe_load
(
f
))
if
args
.
verbose
>
1
:
print
(
vars
(
args
))
print
(
config
)
sentences
,
speaker_set
=
get_phn_dur
(
dur_file
)
merge_silence
(
sentences
)
...
...
paddlespeech/t2s/exps/gan_vocoder/normalize.py
浏览文件 @
1731976e
...
...
@@ -47,30 +47,8 @@ def main():
default
=
False
,
action
=
"store_true"
,
help
=
"whether to skip the copy of wav files."
)
parser
.
add_argument
(
"--verbose"
,
type
=
int
,
default
=
1
,
help
=
"logging level. higher is more logging. (default=1)"
)
args
=
parser
.
parse_args
()
# set logger
if
args
.
verbose
>
1
:
logging
.
basicConfig
(
level
=
logging
.
DEBUG
,
format
=
"%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
elif
args
.
verbose
>
0
:
logging
.
basicConfig
(
level
=
logging
.
INFO
,
format
=
"%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
else
:
logging
.
basicConfig
(
level
=
logging
.
WARN
,
format
=
"%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
logging
.
warning
(
'Skip DEBUG/INFO messages'
)
args
=
parser
.
parse_args
()
dumpdir
=
Path
(
args
.
dumpdir
).
expanduser
()
# use absolute path
...
...
paddlespeech/t2s/exps/gan_vocoder/preprocess.py
浏览文件 @
1731976e
...
...
@@ -167,11 +167,6 @@ def main():
required
=
True
,
help
=
"directory to dump feature files."
)
parser
.
add_argument
(
"--config"
,
type
=
str
,
help
=
"vocoder config file."
)
parser
.
add_argument
(
"--verbose"
,
type
=
int
,
default
=
1
,
help
=
"logging level. higher is more logging. (default=1)"
)
parser
.
add_argument
(
"--num-cpu"
,
type
=
int
,
default
=
1
,
help
=
"number of process."
)
parser
.
add_argument
(
...
...
@@ -197,10 +192,6 @@ def main():
with
open
(
args
.
config
,
'rt'
)
as
f
:
config
=
CfgNode
(
yaml
.
safe_load
(
f
))
if
args
.
verbose
>
1
:
print
(
vars
(
args
))
print
(
config
)
sentences
,
speaker_set
=
get_phn_dur
(
dur_file
)
merge_silence
(
sentences
)
...
...
paddlespeech/t2s/exps/speedyspeech/normalize.py
浏览文件 @
1731976e
...
...
@@ -50,11 +50,6 @@ def main():
"--tones-dict"
,
type
=
str
,
default
=
None
,
help
=
"tone vocabulary file."
)
parser
.
add_argument
(
"--speaker-dict"
,
type
=
str
,
default
=
None
,
help
=
"speaker id map file."
)
parser
.
add_argument
(
"--verbose"
,
type
=
int
,
default
=
1
,
help
=
"logging level. higher is more logging. (default=1)"
)
parser
.
add_argument
(
"--use-relative-path"
,
...
...
@@ -63,24 +58,6 @@ def main():
help
=
"whether use relative path in metadata"
)
args
=
parser
.
parse_args
()
# set logger
if
args
.
verbose
>
1
:
logging
.
basicConfig
(
level
=
logging
.
DEBUG
,
format
=
"%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
elif
args
.
verbose
>
0
:
logging
.
basicConfig
(
level
=
logging
.
INFO
,
format
=
"%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
else
:
logging
.
basicConfig
(
level
=
logging
.
WARN
,
format
=
"%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
logging
.
warning
(
'Skip DEBUG/INFO messages'
)
dumpdir
=
Path
(
args
.
dumpdir
).
expanduser
()
# use absolute path
dumpdir
=
dumpdir
.
resolve
()
...
...
paddlespeech/t2s/exps/speedyspeech/preprocess.py
浏览文件 @
1731976e
...
...
@@ -195,11 +195,6 @@ def main():
parser
.
add_argument
(
"--config"
,
type
=
str
,
help
=
"fastspeech2 config file."
)
parser
.
add_argument
(
"--verbose"
,
type
=
int
,
default
=
1
,
help
=
"logging level. higher is more logging. (default=1)"
)
parser
.
add_argument
(
"--num-cpu"
,
type
=
int
,
default
=
1
,
help
=
"number of process."
)
...
...
@@ -230,10 +225,6 @@ def main():
with
open
(
args
.
config
,
'rt'
)
as
f
:
config
=
CfgNode
(
yaml
.
safe_load
(
f
))
if
args
.
verbose
>
1
:
print
(
vars
(
args
))
print
(
config
)
sentences
,
speaker_set
=
get_phn_dur
(
dur_file
)
merge_silence
(
sentences
)
...
...
paddlespeech/t2s/exps/tacotron2/preprocess.py
浏览文件 @
1731976e
...
...
@@ -184,11 +184,6 @@ def main():
parser
.
add_argument
(
"--config"
,
type
=
str
,
help
=
"fastspeech2 config file."
)
parser
.
add_argument
(
"--verbose"
,
type
=
int
,
default
=
1
,
help
=
"logging level. higher is more logging. (default=1)"
)
parser
.
add_argument
(
"--num-cpu"
,
type
=
int
,
default
=
1
,
help
=
"number of process."
)
...
...
@@ -223,10 +218,6 @@ def main():
with
open
(
args
.
config
,
'rt'
)
as
f
:
config
=
CfgNode
(
yaml
.
safe_load
(
f
))
if
args
.
verbose
>
1
:
print
(
vars
(
args
))
print
(
config
)
sentences
,
speaker_set
=
get_phn_dur
(
dur_file
)
merge_silence
(
sentences
)
...
...
paddlespeech/t2s/exps/transformer_tts/normalize.py
浏览文件 @
1731976e
...
...
@@ -51,30 +51,8 @@ def main():
"--phones-dict"
,
type
=
str
,
default
=
None
,
help
=
"phone vocabulary file."
)
parser
.
add_argument
(
"--speaker-dict"
,
type
=
str
,
default
=
None
,
help
=
"speaker id map file."
)
parser
.
add_argument
(
"--verbose"
,
type
=
int
,
default
=
1
,
help
=
"logging level. higher is more logging. (default=1)"
)
args
=
parser
.
parse_args
()
# set logger
if
args
.
verbose
>
1
:
logging
.
basicConfig
(
level
=
logging
.
DEBUG
,
format
=
"%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
elif
args
.
verbose
>
0
:
logging
.
basicConfig
(
level
=
logging
.
INFO
,
format
=
"%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
else
:
logging
.
basicConfig
(
level
=
logging
.
WARN
,
format
=
"%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
logging
.
warning
(
'Skip DEBUG/INFO messages'
)
args
=
parser
.
parse_args
()
# check directory existence
dumpdir
=
Path
(
args
.
dumpdir
).
resolve
()
...
...
paddlespeech/t2s/exps/transformer_tts/preprocess.py
浏览文件 @
1731976e
...
...
@@ -186,11 +186,6 @@ def main():
type
=
str
,
help
=
"yaml format configuration file."
)
parser
.
add_argument
(
"--verbose"
,
type
=
int
,
default
=
1
,
help
=
"logging level. higher is more logging. (default=1)"
)
parser
.
add_argument
(
"--num-cpu"
,
type
=
int
,
default
=
1
,
help
=
"number of process."
)
...
...
@@ -210,10 +205,6 @@ def main():
_C
=
Configuration
(
_C
)
config
=
_C
.
clone
()
if
args
.
verbose
>
1
:
print
(
vars
(
args
))
print
(
config
)
phone_id_map_path
=
dumpdir
/
"phone_id_map.txt"
speaker_id_map_path
=
dumpdir
/
"speaker_id_map.txt"
...
...
paddlespeech/t2s/exps/vits/normalize.py
浏览文件 @
1731976e
...
...
@@ -16,6 +16,7 @@ import argparse
import
logging
from
operator
import
itemgetter
from
pathlib
import
Path
from
typing
import
List
import
jsonlines
import
numpy
as
np
...
...
@@ -23,6 +24,50 @@ from sklearn.preprocessing import StandardScaler
from
tqdm
import
tqdm
from
paddlespeech.t2s.datasets.data_table
import
DataTable
from
paddlespeech.t2s.utils
import
str2bool
# Phone tokens that `insert_after_character` does NOT follow with a blank.
# Presumably the pinyin initials plus the pause/silence markers produced by
# the aligner -- verify against the phone vocabulary in use.
INITIALS = (
    "b p m f d t n l g k h zh ch sh r z c s j q x "
    "y w sp spl spn sil"
).split()
def intersperse(lst, item):
    """Return a new list with `item` before, between and after the elements of `lst`.

    Example:
        intersperse(["a", "b"], "-") -> ["-", "a", "-", "b", "-"]
    """
    # 2 * len(lst) + 1 output slots: even slots hold `item`, odd slots are
    # filled with the elements of `lst` in order.
    slots = 2 * len(lst) + 1
    source = iter(lst)
    return [next(source) if pos % 2 else item for pos in range(slots)]
def insert_after_character(lst, item):
    """Insert `item` at the start and after every token that is not in INITIALS.

    A token in INITIALS is copied through unchanged. Any other token is
    treated as a tone-carrying final that closes a character, so `item` is
    appended right after it.

    Example (with item "<pad>"):
        ["n", "i3", "h", "ao3"] -> ["<pad>", "n", "i3", "<pad>", "h", "ao3", "<pad>"]

    Args:
        lst: iterable of phone tokens (strings).
        item: the blank token to insert.

    Returns:
        list: a new list; `lst` itself is not modified.

    Raises:
        ValueError: if a token outside INITIALS is empty or does not end with
            a tone digit "1"-"5".
    """
    result = [item]
    for phone in lst:
        result.append(phone)
        if phone in INITIALS:
            continue
        # Finals carry a tone digit. Validate explicitly instead of with
        # `assert`, so the check survives `python -O` and reports the
        # offending token. `not phone` also guards the `phone[-1]` lookup.
        if not phone or phone[-1] not in "12345":
            raise ValueError(
                f"unexpected phone {phone!r}: expected an initial or a final "
                "ending with a tone digit 1-5")
        result.append(item)
    return result
def add_blank(phones: List[str],
              filed: str="character",
              blank_token: str="<pad>") -> List[str]:
    """Insert `blank_token` into a phone sequence.

    Args:
        phones (List[str]): phone tokens of one utterance.
        filed (str): insertion granularity. The parameter name is spelled
            `filed` in the existing interface and is kept for callers.
            "phone": blank before, between and after every phone, e.g.
                ["n", "i3"] -> ["<pad>", "n", "<pad>", "i3", "<pad>"]
            "character": blank at the start and after every token that is
                not in INITIALS, e.g.
                ["n", "i3", "h", "ao3"]
                    -> ["<pad>", "n", "i3", "<pad>", "h", "ao3", "<pad>"]
            Any other value leaves `phones` unchanged.
        blank_token (str): the token to insert.

    Returns:
        List[str]: the phone sequence with blanks added, or `phones` itself
            when `filed` is not one of the two recognised values.
    """
    if filed == "phone":
        return intersperse(phones, blank_token)
    if filed == "character":
        return insert_after_character(phones, blank_token)
    # Unrecognised granularity: pass the input through untouched.
    return phones
def
main
():
...
...
@@ -58,29 +103,12 @@ def main():
parser
.
add_argument
(
"--speaker-dict"
,
type
=
str
,
default
=
None
,
help
=
"speaker id map file."
)
parser
.
add_argument
(
"--verbose"
,
type
=
int
,
default
=
1
,
help
=
"logging level. higher is more logging. (default=1)"
)
args
=
parser
.
parse_args
()
"--add-blank"
,
type
=
str2bool
,
default
=
True
,
help
=
"whether to add blank between phones"
)
# set logger
if
args
.
verbose
>
1
:
logging
.
basicConfig
(
level
=
logging
.
DEBUG
,
format
=
"%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
elif
args
.
verbose
>
0
:
logging
.
basicConfig
(
level
=
logging
.
INFO
,
format
=
"%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
else
:
logging
.
basicConfig
(
level
=
logging
.
WARN
,
format
=
"%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
logging
.
warning
(
'Skip DEBUG/INFO messages'
)
args
=
parser
.
parse_args
()
dumpdir
=
Path
(
args
.
dumpdir
).
expanduser
()
# use absolute path
...
...
@@ -135,13 +163,19 @@ def main():
else
:
wav_path
=
wave
phone_ids
=
[
vocab_phones
[
p
]
for
p
in
item
[
'phones'
]]
phones
=
item
[
'phones'
]
text_lengths
=
item
[
'text_lengths'
]
if
args
.
add_blank
:
phones
=
add_blank
(
phones
,
filed
=
"character"
)
text_lengths
=
len
(
phones
)
phone_ids
=
[
vocab_phones
[
p
]
for
p
in
phones
]
spk_id
=
vocab_speaker
[
item
[
"speaker"
]]
record
=
{
"utt_id"
:
item
[
'utt_id'
],
"text"
:
phone_ids
,
"text_lengths"
:
item
[
'text_lengths'
]
,
"text_lengths"
:
text_lengths
,
'feats'
:
str
(
feats_path
),
"feats_lengths"
:
item
[
'feats_lengths'
],
"wave"
:
str
(
wav_path
),
...
...
paddlespeech/t2s/exps/vits/preprocess.py
浏览文件 @
1731976e
...
...
@@ -197,11 +197,6 @@ def main():
parser
.
add_argument
(
"--config"
,
type
=
str
,
help
=
"fastspeech2 config file."
)
parser
.
add_argument
(
"--verbose"
,
type
=
int
,
default
=
1
,
help
=
"logging level. higher is more logging. (default=1)"
)
parser
.
add_argument
(
"--num-cpu"
,
type
=
int
,
default
=
1
,
help
=
"number of process."
)
...
...
@@ -236,10 +231,6 @@ def main():
with
open
(
args
.
config
,
'rt'
)
as
f
:
config
=
CfgNode
(
yaml
.
safe_load
(
f
))
if
args
.
verbose
>
1
:
print
(
vars
(
args
))
print
(
config
)
sentences
,
speaker_set
=
get_phn_dur
(
dur_file
)
merge_silence
(
sentences
)
...
...
paddlespeech/t2s/exps/vits/synthesize_e2e.py
浏览文件 @
1731976e
...
...
@@ -23,6 +23,7 @@ from yacs.config import CfgNode
from
paddlespeech.t2s.exps.syn_utils
import
get_frontend
from
paddlespeech.t2s.exps.syn_utils
import
get_sentences
from
paddlespeech.t2s.models.vits
import
VITS
from
paddlespeech.t2s.utils
import
str2bool
def
evaluate
(
args
):
...
...
@@ -55,6 +56,7 @@ def evaluate(args):
output_dir
=
Path
(
args
.
output_dir
)
output_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
merge_sentences
=
False
add_blank
=
args
.
add_blank
N
=
0
T
=
0
...
...
@@ -62,7 +64,9 @@ def evaluate(args):
with
timer
()
as
t
:
if
args
.
lang
==
'zh'
:
input_ids
=
frontend
.
get_input_ids
(
sentence
,
merge_sentences
=
merge_sentences
)
sentence
,
merge_sentences
=
merge_sentences
,
add_blank
=
add_blank
)
phone_ids
=
input_ids
[
"phone_ids"
]
elif
args
.
lang
==
'en'
:
input_ids
=
frontend
.
get_input_ids
(
...
...
@@ -125,6 +129,12 @@ def parse_args():
help
=
"text to synthesize, a 'utt_id sentence' pair per line."
)
parser
.
add_argument
(
"--output_dir"
,
type
=
str
,
help
=
"output dir."
)
parser
.
add_argument
(
"--add-blank"
,
type
=
str2bool
,
default
=
True
,
help
=
"whether to add blank between phones"
)
args
=
parser
.
parse_args
()
return
args
...
...
paddlespeech/t2s/exps/vits/train.py
浏览文件 @
1731976e
...
...
@@ -211,13 +211,18 @@ def train_sp(args, config):
generator_first
=
config
.
generator_first
,
output_dir
=
output_dir
)
trainer
=
Trainer
(
updater
,
(
config
.
max_epoch
,
'epoch'
),
output_dir
)
trainer
=
Trainer
(
updater
,
stop_trigger
=
(
config
.
train_max_steps
,
"iteration"
),
out
=
output_dir
)
if
dist
.
get_rank
()
==
0
:
trainer
.
extend
(
evaluator
,
trigger
=
(
1
,
"epoch"
))
trainer
.
extend
(
VisualDL
(
output_dir
),
trigger
=
(
1
,
"iteration"
))
trainer
.
extend
(
Snapshot
(
max_size
=
config
.
num_snapshots
),
trigger
=
(
1
,
'epoch'
))
evaluator
,
trigger
=
(
config
.
eval_interval_steps
,
'iteration'
))
trainer
.
extend
(
VisualDL
(
output_dir
),
trigger
=
(
1
,
'iteration'
))
trainer
.
extend
(
Snapshot
(
max_size
=
config
.
num_snapshots
),
trigger
=
(
config
.
save_interval_steps
,
'iteration'
))
print
(
"Trainer Done!"
)
trainer
.
run
()
...
...
paddlespeech/t2s/exps/waveflow/preprocess.py
浏览文件 @
1731976e
...
...
@@ -143,8 +143,6 @@ if __name__ == "__main__":
nargs
=
argparse
.
REMAINDER
,
help
=
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
)
parser
.
add_argument
(
"-v"
,
"--verbose"
,
action
=
"store_true"
,
help
=
"print msg"
)
config
=
get_cfg_defaults
()
args
=
parser
.
parse_args
()
...
...
@@ -153,8 +151,5 @@ if __name__ == "__main__":
if
args
.
opts
:
config
.
merge_from_list
(
args
.
opts
)
config
.
freeze
()
if
args
.
verbose
:
print
(
config
.
data
)
print
(
args
)
create_dataset
(
config
.
data
,
args
.
input
,
args
.
output
)
paddlespeech/t2s/exps/waveflow/synthesize.py
浏览文件 @
1731976e
...
...
@@ -72,8 +72,6 @@ if __name__ == "__main__":
nargs
=
argparse
.
REMAINDER
,
help
=
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
)
parser
.
add_argument
(
"-v"
,
"--verbose"
,
action
=
"store_true"
,
help
=
"print msg"
)
args
=
parser
.
parse_args
()
if
args
.
config
:
...
...
paddlespeech/t2s/frontend/zh_frontend.py
浏览文件 @
1731976e
...
...
@@ -29,6 +29,29 @@ from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon
from
paddlespeech.t2s.frontend.tone_sandhi
import
ToneSandhi
from
paddlespeech.t2s.frontend.zh_normalization.text_normlization
import
TextNormalizer
# Tokens after which `insert_after_character` adds no blank. Presumably the
# pinyin initials followed by pause/silence markers -- verify against the
# phone vocabulary in use.
INITIALS = [
    'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h',
    'zh', 'ch', 'sh', 'r', 'z', 'c', 's', 'j', 'q', 'x',
    'y', 'w', 'sp', 'spl', 'spn', 'sil',
]
def intersperse(lst, item):
    """Surround every element of `lst` with `item`.

    The result has 2 * len(lst) + 1 entries and both starts and ends with
    `item`, e.g. intersperse([1, 2], 0) -> [0, 1, 0, 2, 0].
    """
    padded = [item for _ in range(2 * len(lst) + 1)]
    # Drop the original elements into the odd positions.
    for offset, element in enumerate(lst):
        padded[2 * offset + 1] = element
    return padded
def insert_after_character(lst, item):
    """Put `item` at the front and after each token that is not in INITIALS.

    Tokens listed in INITIALS are copied through as-is. Every other token is
    followed by `item`. A new list is returned.
    """
    padded = [item]
    for token in lst:
        # Non-initial tokens are taken to be finals closing a character.
        # Their tone suffix is deliberately not validated here.
        padded += [token] if token in INITIALS else [token, item]
    return padded
class
Frontend
():
def
__init__
(
self
,
...
...
@@ -280,12 +303,15 @@ class Frontend():
print
(
"----------------------------"
)
return
phonemes
def
get_input_ids
(
self
,
def
get_input_ids
(
self
,
sentence
:
str
,
merge_sentences
:
bool
=
True
,
get_tone_ids
:
bool
=
False
,
robot
:
bool
=
False
,
print_info
:
bool
=
False
)
->
Dict
[
str
,
List
[
paddle
.
Tensor
]]:
print_info
:
bool
=
False
,
add_blank
:
bool
=
False
,
blank_token
:
str
=
"<pad>"
)
->
Dict
[
str
,
List
[
paddle
.
Tensor
]]:
phonemes
=
self
.
get_phonemes
(
sentence
,
merge_sentences
=
merge_sentences
,
...
...
@@ -299,6 +325,10 @@ class Frontend():
for
part_phonemes
in
phonemes
:
phones
,
tones
=
self
.
_get_phone_tone
(
part_phonemes
,
get_tone_ids
=
get_tone_ids
)
if
add_blank
:
phones
=
insert_after_character
(
phones
,
blank_token
)
if
tones
:
tone_ids
=
self
.
_t2id
(
tones
)
tone_ids
=
paddle
.
to_tensor
(
tone_ids
)
...
...
paddlespeech/t2s/models/vits/vits.py
浏览文件 @
1731976e
...
...
@@ -227,11 +227,7 @@ class VITS(nn.Layer):
lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
forward_generator (bool): Whether to forward generator.
Returns:
Dict[str, Any]:
- loss (Tensor): Loss scalar tensor.
- stats (Dict[str, float]): Statistics to be monitored.
- weight (Tensor): Weight tensor to summarize losses.
- optim_idx (int): Optimizer index (0 for G and 1 for D).
"""
if
forward_generator
:
return
self
.
_forward_generator
(
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录