Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
ERNIE
提交
76b654cb
E
ERNIE
项目概览
PaddlePaddle
/
ERNIE
大约 1 年 前同步成功
通知
109
Star
5997
Fork
1270
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
29
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
E
ERNIE
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
29
Issue
29
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
76b654cb
编写于
6月 09, 2022
作者:
小湉湉
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
format ernie sat
上级
6bcb213c
变更
13
展开全部
隐藏空白更改
内联
并排
Showing
13 changed file
with
980 addition
and
1123 deletion
+980
-1123
ernie-sat/README.md
ernie-sat/README.md
+2
-2
ernie-sat/align.py
ernie-sat/align.py
+117
-46
ernie-sat/align_mandarin.py
ernie-sat/align_mandarin.py
+0
-186
ernie-sat/dataset.py
ernie-sat/dataset.py
+209
-285
ernie-sat/inference.py
ernie-sat/inference.py
+351
-416
ernie-sat/model_paddle.py
ernie-sat/model_paddle.py
+51
-59
ernie-sat/paddlespeech/t2s/modules/nets_utils.py
ernie-sat/paddlespeech/t2s/modules/nets_utils.py
+154
-12
ernie-sat/run_clone_en_to_zh.sh
ernie-sat/run_clone_en_to_zh.sh
+2
-2
ernie-sat/run_gen_en.sh
ernie-sat/run_gen_en.sh
+2
-2
ernie-sat/run_sedit_en.sh
ernie-sat/run_sedit_en.sh
+2
-2
ernie-sat/sedit_arg_parser.py
ernie-sat/sedit_arg_parser.py
+2
-4
ernie-sat/tools/torch_pwgan.py
ernie-sat/tools/torch_pwgan.py
+1
-1
ernie-sat/utils.py
ernie-sat/utils.py
+87
-106
未找到文件。
ernie-sat/README.md
浏览文件 @
76b654cb
...
...
@@ -113,8 +113,8 @@ prompt/dev
8.
` --uid`
特定提示(prompt)语音的 id
9.
` --new_str`
输入的文本(本次开源暂时先设置特定的文本)
10.
` --prefix`
特定音频对应的文本、音素相关文件的地址
11.
` --source_lang
uage
`
, 源语言
12.
` --target_lang
uage
`
, 目标语言
11.
` --source_lang`
, 源语言
12.
` --target_lang`
, 目标语言
13.
` --output_name`
, 合成语音名称
14.
` --task_name`
, 任务名称, 包括:语音编辑任务、个性化语音合成任务、跨语言语音合成任务
15.
` --use_pt_vocoder`
, 英文场景下是否使用 torch 版本的 vocoder, 默认情况下为 False; 设置为 False 则在英文场景下使用 paddle 版本 vocoder
...
...
ernie-sat/align
_english
.py
→
ernie-sat/align.py
浏览文件 @
76b654cb
#!/usr/bin/env python
""" Usage:
align
_english
.py wavfile trsfile outwordfile outphonefile
align.py wavfile trsfile outwordfile outphonefile
"""
import
multiprocessing
as
mp
import
os
...
...
@@ -9,12 +9,45 @@ import sys
from
tqdm
import
tqdm
PHONEME
=
'tools/aligner/english_envir/english2phoneme/phoneme'
MODEL_DIR
=
'tools/aligner/english'
MODEL_DIR_EN
=
'tools/aligner/english'
MODEL_DIR_ZH
=
'tools/aligner/mandarin'
HVITE
=
'tools/htk/HTKTools/HVite'
HCOPY
=
'tools/htk/HTKTools/HCopy'
def
prep_txt
(
line
,
tmpbase
,
dictfile
):
def
prep_txt_zh
(
line
:
str
,
tmpbase
:
str
,
dictfile
:
str
):
words
=
[]
line
=
line
.
strip
()
for
pun
in
[
','
,
'.'
,
':'
,
';'
,
'!'
,
'?'
,
'"'
,
'('
,
')'
,
'--'
,
'---'
,
u
','
,
u
'。'
,
u
':'
,
u
';'
,
u
'!'
,
u
'?'
,
u
'('
,
u
')'
]:
line
=
line
.
replace
(
pun
,
' '
)
for
wrd
in
line
.
split
():
if
(
wrd
[
-
1
]
==
'-'
):
wrd
=
wrd
[:
-
1
]
if
(
wrd
[
0
]
==
"'"
):
wrd
=
wrd
[
1
:]
if
wrd
:
words
.
append
(
wrd
)
ds
=
set
([])
with
open
(
dictfile
,
'r'
)
as
fid
:
for
line
in
fid
:
ds
.
add
(
line
.
split
()[
0
])
unk_words
=
set
([])
with
open
(
tmpbase
+
'.txt'
,
'w'
)
as
fwid
:
for
wrd
in
words
:
if
(
wrd
not
in
ds
):
unk_words
.
add
(
wrd
)
fwid
.
write
(
wrd
+
' '
)
fwid
.
write
(
'
\n
'
)
return
unk_words
def
prep_txt_en
(
line
:
str
,
tmpbase
,
dictfile
):
words
=
[]
...
...
@@ -97,7 +130,7 @@ def prep_txt(line, tmpbase, dictfile):
fw
.
close
()
def
prep_mlf
(
txt
,
tmpbase
):
def
prep_mlf
(
txt
:
str
,
tmpbase
:
str
):
with
open
(
tmpbase
+
'.mlf'
,
'w'
)
as
fwid
:
fwid
.
write
(
'#!MLF!#
\n
'
)
...
...
@@ -110,7 +143,55 @@ def prep_mlf(txt, tmpbase):
fwid
.
write
(
'.
\n
'
)
def
gen_res
(
tmpbase
,
outfile1
,
outfile2
):
def
_get_user
():
return
os
.
path
.
expanduser
(
'~'
).
split
(
"/"
)[
-
1
]
def
alignment
(
wav_path
:
str
,
text
:
str
):
tmpbase
=
'/tmp/'
+
_get_user
()
+
'_'
+
str
(
os
.
getpid
())
#prepare wav and trs files
try
:
os
.
system
(
'sox '
+
wav_path
+
' -r 16000 '
+
tmpbase
+
'.wav remix -'
)
except
:
print
(
'sox error!'
)
return
None
#prepare clean_transcript file
try
:
prep_txt_en
(
text
,
tmpbase
,
MODEL_DIR_EN
+
'/dict'
)
except
:
print
(
'prep_txt error!'
)
return
None
#prepare mlf file
try
:
with
open
(
tmpbase
+
'.txt'
,
'r'
)
as
fid
:
txt
=
fid
.
readline
()
prep_mlf
(
txt
,
tmpbase
)
except
:
print
(
'prep_mlf error!'
)
return
None
#prepare scp
try
:
os
.
system
(
HCOPY
+
' -C '
+
MODEL_DIR_EN
+
'/16000/config '
+
tmpbase
+
'.wav'
+
' '
+
tmpbase
+
'.plp'
)
except
:
print
(
'HCopy error!'
)
return
None
#run alignment
try
:
os
.
system
(
HVITE
+
' -a -m -t 10000.0 10000.0 100000.0 -I '
+
tmpbase
+
'.mlf -H '
+
MODEL_DIR_EN
+
'/16000/macros -H '
+
MODEL_DIR_EN
+
'/16000/hmmdefs -i '
+
tmpbase
+
'.aligned '
+
tmpbase
+
'.dict '
+
MODEL_DIR_EN
+
'/monophones '
+
tmpbase
+
'.plp 2>&1 > /dev/null'
)
except
:
print
(
'HVite error!'
)
return
None
with
open
(
tmpbase
+
'.txt'
,
'r'
)
as
fid
:
words
=
fid
.
readline
().
strip
().
split
()
words
=
txt
.
strip
().
split
()
...
...
@@ -119,59 +200,47 @@ def gen_res(tmpbase, outfile1, outfile2):
with
open
(
tmpbase
+
'.aligned'
,
'r'
)
as
fid
:
lines
=
fid
.
readlines
()
i
=
2
times1
=
[]
times2
=
[]
word2phns
=
{}
current_word
=
''
index
=
0
while
(
i
<
len
(
lines
)):
if
(
len
(
lines
[
i
].
split
())
>=
4
)
and
(
lines
[
i
].
split
()[
0
]
!=
lines
[
i
].
split
()
[
1
]):
phn
=
lines
[
i
].
split
()
[
2
]
pst
=
(
int
(
lines
[
i
].
split
()
[
0
])
/
1000
+
125
)
/
10000
pen
=
(
int
(
lines
[
i
].
split
()
[
1
])
/
1000
+
125
)
/
10000
splited_line
=
lines
[
i
].
strip
().
split
()
if
(
len
(
splited_line
)
>=
4
)
and
(
splited_line
[
0
]
!=
splited_line
[
1
]):
phn
=
splited_line
[
2
]
pst
=
(
int
(
splited_line
[
0
])
/
1000
+
125
)
/
10000
pen
=
(
int
(
splited_line
[
1
])
/
1000
+
125
)
/
10000
times2
.
append
([
phn
,
pst
,
pen
])
if
(
len
(
lines
[
i
].
split
())
==
5
):
if
(
lines
[
i
].
split
()[
0
]
!=
lines
[
i
].
split
()[
1
]):
wrd
=
lines
[
i
].
split
()[
-
1
].
strip
()
st
=
(
int
(
lines
[
i
].
split
()[
0
])
/
1000
+
125
)
/
10000
j
=
i
+
1
while
(
lines
[
j
]
!=
'.
\n
'
)
and
(
len
(
lines
[
j
].
split
())
!=
5
):
j
+=
1
en
=
(
int
(
lines
[
j
-
1
].
split
()[
1
])
/
1000
+
125
)
/
10000
times1
.
append
([
wrd
,
st
,
en
])
# splited_line[-1]!='sp'
if
len
(
splited_line
)
==
5
:
current_word
=
str
(
index
)
+
'_'
+
splited_line
[
-
1
]
word2phns
[
current_word
]
=
phn
index
+=
1
elif
len
(
splited_line
)
==
4
:
word2phns
[
current_word
]
+=
' '
+
phn
i
+=
1
with
open
(
outfile1
,
'w'
)
as
fwid
:
for
item
in
times1
:
if
(
item
[
0
]
==
'sp'
):
fwid
.
write
(
str
(
item
[
1
])
+
' '
+
str
(
item
[
2
])
+
' SIL
\n
'
)
else
:
wrd
=
words
.
pop
()
fwid
.
write
(
str
(
item
[
1
])
+
' '
+
str
(
item
[
2
])
+
' '
+
wrd
+
'
\n
'
)
if
words
:
print
(
'not matched::'
+
alignfile
)
sys
.
exit
(
1
)
with
open
(
outfile2
,
'w'
)
as
fwid
:
for
item
in
times2
:
fwid
.
write
(
str
(
item
[
1
])
+
' '
+
str
(
item
[
2
])
+
' '
+
item
[
0
]
+
'
\n
'
)
def
_get_user
():
return
os
.
path
.
expanduser
(
'~'
).
split
(
"/"
)[
-
1
]
return
times2
,
word2phns
def
alignment
(
wav_path
,
text_string
):
def
alignment
_zh
(
wav_path
,
text_string
):
tmpbase
=
'/tmp/'
+
_get_user
()
+
'_'
+
str
(
os
.
getpid
())
#prepare wav and trs files
try
:
os
.
system
(
'sox '
+
wav_path
+
' -r 16000 '
+
tmpbase
+
'.wav remix -'
)
os
.
system
(
'sox '
+
wav_path
+
' -r 16000 -b 16 '
+
tmpbase
+
'.wav remix -'
)
except
:
print
(
'sox error!'
)
return
None
#prepare clean_transcript file
try
:
prep_txt
(
text_string
,
tmpbase
,
MODEL_DIR
+
'/dict'
)
unk_words
=
prep_txt_zh
(
text_string
,
tmpbase
,
MODEL_DIR_ZH
+
'/dict'
)
if
unk_words
:
print
(
'Error! Please add the following words to dictionary:'
)
for
unk
in
unk_words
:
print
(
"非法words: "
,
unk
)
except
:
print
(
'prep_txt error!'
)
return
None
...
...
@@ -187,7 +256,7 @@ def alignment(wav_path, text_string):
#prepare scp
try
:
os
.
system
(
HCOPY
+
' -C '
+
MODEL_DIR
+
'/16000/config '
+
tmpbase
+
os
.
system
(
HCOPY
+
' -C '
+
MODEL_DIR
_ZH
+
'/16000/config '
+
tmpbase
+
'.wav'
+
' '
+
tmpbase
+
'.plp'
)
except
:
print
(
'HCopy error!'
)
...
...
@@ -196,10 +265,11 @@ def alignment(wav_path, text_string):
#run alignment
try
:
os
.
system
(
HVITE
+
' -a -m -t 10000.0 10000.0 100000.0 -I '
+
tmpbase
+
'.mlf -H '
+
MODEL_DIR
+
'/16000/macros -H '
+
MODEL_DIR
+
'/16000/hmmdefs -i '
+
tmpbase
+
'.aligned '
+
tmpbase
+
'.dict '
+
MODEL_DIR
+
'/monophones '
+
tmpbase
+
'.mlf -H '
+
MODEL_DIR
_ZH
+
'/16000/macros -H '
+
MODEL_DIR_ZH
+
'/16000/hmmdefs -i '
+
tmpbase
+
'.aligned '
+
MODEL_DIR_ZH
+
'/dict '
+
MODEL_DIR_ZH
+
'/monophones '
+
tmpbase
+
'.plp 2>&1 > /dev/null'
)
except
:
print
(
'HVite error!'
)
return
None
...
...
@@ -211,6 +281,7 @@ def alignment(wav_path, text_string):
with
open
(
tmpbase
+
'.aligned'
,
'r'
)
as
fid
:
lines
=
fid
.
readlines
()
i
=
2
times2
=
[]
word2phns
=
{}
...
...
ernie-sat/align_mandarin.py
已删除
100755 → 0
浏览文件 @
6bcb213c
#!/usr/bin/env python
""" Usage:
align_mandarin.py wavfile trsfile outwordfile putphonefile
"""
import
multiprocessing
as
mp
import
os
import
sys
from
tqdm
import
tqdm
MODEL_DIR
=
'tools/aligner/mandarin'
HVITE
=
'tools/htk/HTKTools/HVite'
HCOPY
=
'tools/htk/HTKTools/HCopy'
def
prep_txt
(
line
,
tmpbase
,
dictfile
):
words
=
[]
line
=
line
.
strip
()
for
pun
in
[
','
,
'.'
,
':'
,
';'
,
'!'
,
'?'
,
'"'
,
'('
,
')'
,
'--'
,
'---'
,
u
','
,
u
'。'
,
u
':'
,
u
';'
,
u
'!'
,
u
'?'
,
u
'('
,
u
')'
]:
line
=
line
.
replace
(
pun
,
' '
)
for
wrd
in
line
.
split
():
if
(
wrd
[
-
1
]
==
'-'
):
wrd
=
wrd
[:
-
1
]
if
(
wrd
[
0
]
==
"'"
):
wrd
=
wrd
[
1
:]
if
wrd
:
words
.
append
(
wrd
)
ds
=
set
([])
with
open
(
dictfile
,
'r'
)
as
fid
:
for
line
in
fid
:
ds
.
add
(
line
.
split
()[
0
])
unk_words
=
set
([])
with
open
(
tmpbase
+
'.txt'
,
'w'
)
as
fwid
:
for
wrd
in
words
:
if
(
wrd
not
in
ds
):
unk_words
.
add
(
wrd
)
fwid
.
write
(
wrd
+
' '
)
fwid
.
write
(
'
\n
'
)
return
unk_words
def
prep_mlf
(
txt
,
tmpbase
):
with
open
(
tmpbase
+
'.mlf'
,
'w'
)
as
fwid
:
fwid
.
write
(
'#!MLF!#
\n
'
)
fwid
.
write
(
'"'
+
tmpbase
+
'.lab"
\n
'
)
fwid
.
write
(
'sp
\n
'
)
wrds
=
txt
.
split
()
for
wrd
in
wrds
:
fwid
.
write
(
wrd
.
upper
()
+
'
\n
'
)
fwid
.
write
(
'sp
\n
'
)
fwid
.
write
(
'.
\n
'
)
def
gen_res
(
tmpbase
,
outfile1
,
outfile2
):
with
open
(
tmpbase
+
'.txt'
,
'r'
)
as
fid
:
words
=
fid
.
readline
().
strip
().
split
()
words
=
txt
.
strip
().
split
()
words
.
reverse
()
with
open
(
tmpbase
+
'.aligned'
,
'r'
)
as
fid
:
lines
=
fid
.
readlines
()
i
=
2
times1
=
[]
times2
=
[]
while
(
i
<
len
(
lines
)):
if
(
len
(
lines
[
i
].
split
())
>=
4
)
and
(
lines
[
i
].
split
()[
0
]
!=
lines
[
i
].
split
()[
1
]):
phn
=
lines
[
i
].
split
()[
2
]
pst
=
(
int
(
lines
[
i
].
split
()[
0
])
/
1000
+
125
)
/
10000
pen
=
(
int
(
lines
[
i
].
split
()[
1
])
/
1000
+
125
)
/
10000
times2
.
append
([
phn
,
pst
,
pen
])
if
(
len
(
lines
[
i
].
split
())
==
5
):
if
(
lines
[
i
].
split
()[
0
]
!=
lines
[
i
].
split
()[
1
]):
wrd
=
lines
[
i
].
split
()[
-
1
].
strip
()
st
=
(
int
(
lines
[
i
].
split
()[
0
])
/
1000
+
125
)
/
10000
j
=
i
+
1
while
(
lines
[
j
]
!=
'.
\n
'
)
and
(
len
(
lines
[
j
].
split
())
!=
5
):
j
+=
1
en
=
(
int
(
lines
[
j
-
1
].
split
()[
1
])
/
1000
+
125
)
/
10000
times1
.
append
([
wrd
,
st
,
en
])
i
+=
1
with
open
(
outfile1
,
'w'
)
as
fwid
:
for
item
in
times1
:
if
(
item
[
0
]
==
'sp'
):
fwid
.
write
(
str
(
item
[
1
])
+
' '
+
str
(
item
[
2
])
+
' SIL
\n
'
)
else
:
wrd
=
words
.
pop
()
fwid
.
write
(
str
(
item
[
1
])
+
' '
+
str
(
item
[
2
])
+
' '
+
wrd
+
'
\n
'
)
if
words
:
print
(
'not matched::'
+
alignfile
)
sys
.
exit
(
1
)
with
open
(
outfile2
,
'w'
)
as
fwid
:
for
item
in
times2
:
fwid
.
write
(
str
(
item
[
1
])
+
' '
+
str
(
item
[
2
])
+
' '
+
item
[
0
]
+
'
\n
'
)
def
alignment_zh
(
wav_path
,
text_string
):
tmpbase
=
'/tmp/'
+
os
.
environ
[
'USER'
]
+
'_'
+
str
(
os
.
getpid
())
#prepare wav and trs files
try
:
os
.
system
(
'sox '
+
wav_path
+
' -r 16000 -b 16 '
+
tmpbase
+
'.wav remix -'
)
except
:
print
(
'sox error!'
)
return
None
#prepare clean_transcript file
try
:
unk_words
=
prep_txt
(
text_string
,
tmpbase
,
MODEL_DIR
+
'/dict'
)
if
unk_words
:
print
(
'Error! Please add the following words to dictionary:'
)
for
unk
in
unk_words
:
print
(
"非法words: "
,
unk
)
except
:
print
(
'prep_txt error!'
)
return
None
#prepare mlf file
try
:
with
open
(
tmpbase
+
'.txt'
,
'r'
)
as
fid
:
txt
=
fid
.
readline
()
prep_mlf
(
txt
,
tmpbase
)
except
:
print
(
'prep_mlf error!'
)
return
None
#prepare scp
try
:
os
.
system
(
HCOPY
+
' -C '
+
MODEL_DIR
+
'/16000/config '
+
tmpbase
+
'.wav'
+
' '
+
tmpbase
+
'.plp'
)
except
:
print
(
'HCopy error!'
)
return
None
#run alignment
try
:
os
.
system
(
HVITE
+
' -a -m -t 10000.0 10000.0 100000.0 -I '
+
tmpbase
+
'.mlf -H '
+
MODEL_DIR
+
'/16000/macros -H '
+
MODEL_DIR
+
'/16000/hmmdefs -i '
+
tmpbase
+
'.aligned '
+
MODEL_DIR
+
'/dict '
+
MODEL_DIR
+
'/monophones '
+
tmpbase
+
'.plp 2>&1 > /dev/null'
)
except
:
print
(
'HVite error!'
)
return
None
with
open
(
tmpbase
+
'.txt'
,
'r'
)
as
fid
:
words
=
fid
.
readline
().
strip
().
split
()
words
=
txt
.
strip
().
split
()
words
.
reverse
()
with
open
(
tmpbase
+
'.aligned'
,
'r'
)
as
fid
:
lines
=
fid
.
readlines
()
i
=
2
times2
=
[]
word2phns
=
{}
current_word
=
''
index
=
0
while
(
i
<
len
(
lines
)):
splited_line
=
lines
[
i
].
strip
().
split
()
if
(
len
(
splited_line
)
>=
4
)
and
(
splited_line
[
0
]
!=
splited_line
[
1
]):
phn
=
splited_line
[
2
]
pst
=
(
int
(
splited_line
[
0
])
/
1000
+
125
)
/
10000
pen
=
(
int
(
splited_line
[
1
])
/
1000
+
125
)
/
10000
times2
.
append
([
phn
,
pst
,
pen
])
# splited_line[-1]!='sp'
if
len
(
splited_line
)
==
5
:
current_word
=
str
(
index
)
+
'_'
+
splited_line
[
-
1
]
word2phns
[
current_word
]
=
phn
index
+=
1
elif
len
(
splited_line
)
==
4
:
word2phns
[
current_word
]
+=
' '
+
phn
i
+=
1
return
times2
,
word2phns
ernie-sat/dataset.py
浏览文件 @
76b654cb
此差异已折叠。
点击以展开。
ernie-sat/inference.py
浏览文件 @
76b654cb
此差异已折叠。
点击以展开。
ernie-sat/model_paddle.py
浏览文件 @
76b654cb
...
...
@@ -121,12 +121,10 @@ class NewMaskInputLayer(nn.Layer):
default_initializer
=
paddle
.
nn
.
initializer
.
Assign
(
paddle
.
normal
(
shape
=
(
1
,
1
,
out_features
))))
def
forward
(
self
,
input
:
paddle
.
Tensor
,
masked_position
=
None
)
->
paddle
.
Tensor
:
masked_position
=
paddle
.
expand_as
(
paddle
.
unsqueeze
(
masked_position
,
-
1
),
input
)
masked_input
=
masked_fill
(
input
,
masked_position
,
0
)
+
masked_fill
(
paddle
.
expand_as
(
self
.
mask_feature
,
input
),
~
masked_position
,
0
)
def
forward
(
self
,
input
:
paddle
.
Tensor
,
masked_pos
=
None
)
->
paddle
.
Tensor
:
masked_pos
=
paddle
.
expand_as
(
paddle
.
unsqueeze
(
masked_pos
,
-
1
),
input
)
masked_input
=
masked_fill
(
input
,
masked_pos
,
0
)
+
masked_fill
(
paddle
.
expand_as
(
self
.
mask_feature
,
input
),
~
masked_pos
,
0
)
return
masked_input
...
...
@@ -443,37 +441,34 @@ class MLMEncoder(nn.Layer):
def
forward
(
self
,
speech_pad
,
text_pad
,
masked_pos
ition
,
masked_pos
,
speech_mask
=
None
,
text_mask
=
None
,
speech_seg
ment
_pos
=
None
,
text_seg
ment
_pos
=
None
):
speech_seg_pos
=
None
,
text_seg_pos
=
None
):
"""Encode input sequence.
"""
if
masked_pos
ition
is
not
None
:
speech_pad
=
self
.
speech_embed
(
speech_pad
,
masked_pos
ition
)
if
masked_pos
is
not
None
:
speech_pad
=
self
.
speech_embed
(
speech_pad
,
masked_pos
)
else
:
speech_pad
=
self
.
speech_embed
(
speech_pad
)
# pure speech input
if
-
2
in
np
.
array
(
text_pad
):
text_pad
=
text_pad
+
3
text_mask
=
paddle
.
unsqueeze
(
bool
(
text_pad
),
1
)
text_seg
ment
_pos
=
paddle
.
zeros_like
(
text_pad
)
text_seg_pos
=
paddle
.
zeros_like
(
text_pad
)
text_pad
=
self
.
text_embed
(
text_pad
)
text_pad
=
(
text_pad
[
0
]
+
self
.
segment_emb
(
text_seg
ment
_pos
),
text_pad
=
(
text_pad
[
0
]
+
self
.
segment_emb
(
text_seg_pos
),
text_pad
[
1
])
text_seg
ment
_pos
=
None
text_seg_pos
=
None
elif
text_pad
is
not
None
:
text_pad
=
self
.
text_embed
(
text_pad
)
segment_emb
=
None
if
speech_segment_pos
is
not
None
and
text_segment_pos
is
not
None
and
self
.
segment_emb
:
speech_segment_emb
=
self
.
segment_emb
(
speech_segment_pos
)
text_segment_emb
=
self
.
segment_emb
(
text_segment_pos
)
text_pad
=
(
text_pad
[
0
]
+
text_segment_emb
,
text_pad
[
1
])
speech_pad
=
(
speech_pad
[
0
]
+
speech_segment_emb
,
speech_pad
[
1
])
segment_emb
=
paddle
.
concat
(
[
speech_segment_emb
,
text_segment_emb
],
axis
=
1
)
if
speech_seg_pos
is
not
None
and
text_seg_pos
is
not
None
and
self
.
segment_emb
:
speech_seg_emb
=
self
.
segment_emb
(
speech_seg_pos
)
text_seg_emb
=
self
.
segment_emb
(
text_seg_pos
)
text_pad
=
(
text_pad
[
0
]
+
text_seg_emb
,
text_pad
[
1
])
speech_pad
=
(
speech_pad
[
0
]
+
speech_seg_emb
,
speech_pad
[
1
])
if
self
.
pre_speech_encoders
:
speech_pad
,
_
=
self
.
pre_speech_encoders
(
speech_pad
,
speech_mask
)
...
...
@@ -493,11 +488,11 @@ class MLMEncoder(nn.Layer):
if
self
.
normalize_before
:
xs
=
self
.
after_norm
(
xs
)
return
xs
,
masks
#, segment_emb
return
xs
,
masks
class
MLMDecoder
(
MLMEncoder
):
def
forward
(
self
,
xs
,
masks
,
masked_pos
ition
=
None
,
segment_emb
=
None
):
def
forward
(
self
,
xs
,
masks
,
masked_pos
=
None
,
segment_emb
=
None
):
"""Encode input sequence.
Args:
...
...
@@ -509,9 +504,8 @@ class MLMDecoder(MLMEncoder):
paddle.Tensor: Mask tensor (#batch, time).
"""
emb
,
mlm_position
=
None
,
None
if
not
self
.
training
:
masked_pos
ition
=
None
masked_pos
=
None
xs
=
self
.
embed
(
xs
)
if
segment_emb
:
xs
=
(
xs
[
0
]
+
segment_emb
,
xs
[
1
])
...
...
@@ -632,18 +626,18 @@ class MLMModel(nn.Layer):
def
collect_feats
(
self
,
speech
,
speech_len
gth
s
,
speech_lens
,
text
,
text_len
gth
s
,
masked_pos
ition
,
text_lens
,
masked_pos
,
speech_mask
,
text_mask
,
speech_seg
ment
_pos
,
text_seg
ment
_pos
,
speech_seg_pos
,
text_seg_pos
,
y_masks
=
None
)
->
Dict
[
str
,
paddle
.
Tensor
]:
return
{
"feats"
:
speech
,
"feats_len
gths"
:
speech_length
s
}
return
{
"feats"
:
speech
,
"feats_len
s"
:
speech_len
s
}
def
forward
(
self
,
batch
,
speech_seg
ment
_pos
,
y_masks
=
None
):
def
forward
(
self
,
batch
,
speech_seg_pos
,
y_masks
=
None
):
# feats: (Batch, Length, Dim)
# -> encoder_out: (Batch, Length2, Dim2)
speech_pad_placeholder
=
batch
[
'speech_pad'
]
...
...
@@ -654,7 +648,7 @@ class MLMModel(nn.Layer):
if
self
.
decoder
is
not
None
:
zs
,
_
=
self
.
decoder
(
ys_in
,
y_masks
,
encoder_out
,
bool
(
h_masks
),
self
.
encoder
.
segment_emb
(
speech_seg
ment
_pos
))
self
.
encoder
.
segment_emb
(
speech_seg_pos
))
speech_hidden_states
=
zs
else
:
speech_hidden_states
=
encoder_out
[:,
:
paddle
.
shape
(
batch
[
...
...
@@ -672,21 +666,21 @@ class MLMModel(nn.Layer):
else
:
after_outs
=
None
return
before_outs
,
after_outs
,
speech_pad_placeholder
,
batch
[
'masked_pos
ition
'
]
'masked_pos'
]
def
inference
(
self
,
speech
,
text
,
masked_pos
ition
,
masked_pos
,
speech_mask
,
text_mask
,
speech_seg
ment
_pos
,
text_seg
ment
_pos
,
span_b
oundar
y
,
speech_seg_pos
,
text_seg_pos
,
span_b
d
y
,
y_masks
=
None
,
speech_len
gth
s
=
None
,
text_len
gth
s
=
None
,
speech_lens
=
None
,
text_lens
=
None
,
feats
:
Optional
[
paddle
.
Tensor
]
=
None
,
spembs
:
Optional
[
paddle
.
Tensor
]
=
None
,
sids
:
Optional
[
paddle
.
Tensor
]
=
None
,
...
...
@@ -699,24 +693,24 @@ class MLMModel(nn.Layer):
batch
=
dict
(
speech_pad
=
speech
,
text_pad
=
text
,
masked_pos
ition
=
masked_position
,
masked_pos
=
masked_pos
,
speech_mask
=
speech_mask
,
text_mask
=
text_mask
,
speech_seg
ment_pos
=
speech_segment
_pos
,
text_seg
ment_pos
=
text_segment
_pos
,
)
speech_seg
_pos
=
speech_seg
_pos
,
text_seg
_pos
=
text_seg
_pos
,
)
# # inference with teacher forcing
# hs, h_masks = self.encoder(**batch)
outs
=
[
batch
[
'speech_pad'
][:,
:
span_b
oundar
y
[
0
]]]
outs
=
[
batch
[
'speech_pad'
][:,
:
span_b
d
y
[
0
]]]
z_cache
=
None
if
use_teacher_forcing
:
before
,
zs
,
_
,
_
=
self
.
forward
(
batch
,
speech_seg
ment
_pos
,
y_masks
=
y_masks
)
batch
,
speech_seg_pos
,
y_masks
=
y_masks
)
if
zs
is
None
:
zs
=
before
outs
+=
[
zs
[
0
][
span_b
oundary
[
0
]:
span_boundar
y
[
1
]]]
outs
+=
[
batch
[
'speech_pad'
][:,
span_b
oundar
y
[
1
]:]]
outs
+=
[
zs
[
0
][
span_b
dy
[
0
]:
span_bd
y
[
1
]]]
outs
+=
[
batch
[
'speech_pad'
][:,
span_b
d
y
[
1
]:]]
return
dict
(
feat_gen
=
outs
)
return
None
...
...
@@ -733,7 +727,7 @@ class MLMModel(nn.Layer):
class
MLMEncAsDecoderModel
(
MLMModel
):
def
forward
(
self
,
batch
,
speech_seg
ment
_pos
,
y_masks
=
None
):
def
forward
(
self
,
batch
,
speech_seg_pos
,
y_masks
=
None
):
# feats: (Batch, Length, Dim)
# -> encoder_out: (Batch, Length2, Dim2)
speech_pad_placeholder
=
batch
[
'speech_pad'
]
...
...
@@ -756,7 +750,7 @@ class MLMEncAsDecoderModel(MLMModel):
else
:
after_outs
=
None
return
before_outs
,
after_outs
,
speech_pad_placeholder
,
batch
[
'masked_pos
ition
'
]
'masked_pos'
]
class
MLMDualMaksingModel
(
MLMModel
):
...
...
@@ -767,9 +761,9 @@ class MLMDualMaksingModel(MLMModel):
batch
):
xs_pad
=
batch
[
'speech_pad'
]
text_pad
=
batch
[
'text_pad'
]
masked_pos
ition
=
batch
[
'masked_position
'
]
text_masked_pos
ition
=
batch
[
'text_masked_position
'
]
mlm_loss_pos
ition
=
masked_position
>
0
masked_pos
=
batch
[
'masked_pos
'
]
text_masked_pos
=
batch
[
'text_masked_pos
'
]
mlm_loss_pos
=
masked_pos
>
0
loss
=
paddle
.
sum
(
self
.
l1_loss_func
(
paddle
.
reshape
(
before_outs
,
(
-
1
,
self
.
odim
)),
...
...
@@ -782,19 +776,17 @@ class MLMDualMaksingModel(MLMModel):
paddle
.
reshape
(
xs_pad
,
(
-
1
,
self
.
odim
))),
axis
=-
1
)
loss_mlm
=
paddle
.
sum
((
loss
*
paddle
.
reshape
(
mlm_loss_pos
ition
,
[
-
1
])))
/
paddle
.
sum
((
mlm_loss_position
)
+
1e-10
)
mlm_loss_pos
,
[
-
1
])))
/
paddle
.
sum
((
mlm_loss_pos
)
+
1e-10
)
loss_text
=
paddle
.
sum
((
self
.
text_mlm_loss
(
paddle
.
reshape
(
text_outs
,
(
-
1
,
self
.
vocab_size
)),
paddle
.
reshape
(
text_pad
,
(
-
1
)))
*
paddle
.
reshape
(
text_masked_position
,
(
-
1
))))
/
paddle
.
sum
((
text_masked_position
)
+
1e-10
)
text_masked_pos
,
(
-
1
))))
/
paddle
.
sum
((
text_masked_pos
)
+
1e-10
)
return
loss_mlm
,
loss_text
def
forward
(
self
,
batch
,
speech_seg
ment
_pos
,
y_masks
=
None
):
def
forward
(
self
,
batch
,
speech_seg_pos
,
y_masks
=
None
):
# feats: (Batch, Length, Dim)
# -> encoder_out: (Batch, Length2, Dim2)
speech_pad_placeholder
=
batch
[
'speech_pad'
]
encoder_out
,
h_masks
=
self
.
encoder
(
**
batch
)
# segment_emb
if
self
.
decoder
is
not
None
:
zs
,
_
=
self
.
decoder
(
encoder_out
,
h_masks
)
...
...
@@ -819,7 +811,7 @@ class MLMDualMaksingModel(MLMModel):
[
0
,
2
,
1
])
else
:
after_outs
=
None
return
before_outs
,
after_outs
,
text_outs
,
None
#, speech_pad_placeholder, batch['masked_pos
ition'],batch['text_masked_position
']
return
before_outs
,
after_outs
,
text_outs
,
None
#, speech_pad_placeholder, batch['masked_pos
'],batch['text_masked_pos
']
def
build_model_from_file
(
config_file
,
model_file
):
...
...
ernie-sat/paddlespeech/t2s/modules/nets_utils.py
浏览文件 @
76b654cb
...
...
@@ -38,7 +38,7 @@ def pad_list(xs, pad_value):
"""
n_batch
=
len
(
xs
)
max_len
=
max
(
x
.
shape
[
0
]
for
x
in
xs
)
pad
=
paddle
.
full
([
n_batch
,
max_len
,
*
xs
[
0
].
shape
[
1
:]],
pad_value
)
pad
=
paddle
.
full
([
n_batch
,
max_len
,
*
xs
[
0
].
shape
[
1
:]],
pad_value
,
dtype
=
xs
[
0
].
dtype
)
for
i
in
range
(
n_batch
):
pad
[
i
,
:
xs
[
i
].
shape
[
0
]]
=
xs
[
i
]
...
...
@@ -46,13 +46,18 @@ def pad_list(xs, pad_value):
return
pad
def
make_pad_mask
(
lengths
,
length_dim
=-
1
):
def
make_pad_mask
(
lengths
,
xs
=
None
,
length_dim
=-
1
):
"""Make mask tensor containing indices of padded part.
Args:
lengths (Tensor(int64)): Batch of lengths (B,).
xs (Tensor, optional): The reference tensor.
If set, masks will be the same shape as this tensor.
length_dim (int, optional): Dimension indicator of the above tensor.
See the example.
Returns:
Returns:
Tensor(bool): Mask tensor containing indices of padded part bool.
Examples:
...
...
@@ -61,23 +66,98 @@ def make_pad_mask(lengths, length_dim=-1):
>>> lengths = [5, 3, 2]
>>> make_non_pad_mask(lengths)
masks = [[0, 0, 0, 0 ,0],
[0, 0, 0, 1, 1],
[0, 0, 1, 1, 1]]
[0, 0, 0, 1, 1],
[0, 0, 1, 1, 1]]
With the reference tensor.
>>> xs = paddle.zeros((3, 2, 4))
>>> make_pad_mask(lengths, xs)
tensor([[[0, 0, 0, 0],
[0, 0, 0, 0]],
[[0, 0, 0, 1],
[0, 0, 0, 1]],
[[0, 0, 1, 1],
[0, 0, 1, 1]]])
>>> xs = paddle.zeros((3, 2, 6))
>>> make_pad_mask(lengths, xs)
tensor([[[0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 1]],
[[0, 0, 0, 1, 1, 1],
[0, 0, 0, 1, 1, 1]],
[[0, 0, 1, 1, 1, 1],
[0, 0, 1, 1, 1, 1]]])
With the reference tensor and dimension indicator.
>>> xs = paddle.zeros((3, 6, 6))
>>> make_pad_mask(lengths, xs, 1)
tensor([[[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1]],
[[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1]],
[[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1]]])
>>> make_pad_mask(lengths, xs, 2)
tensor([[[0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 1]],
[[0, 0, 0, 1, 1, 1],
[0, 0, 0, 1, 1, 1],
[0, 0, 0, 1, 1, 1],
[0, 0, 0, 1, 1, 1],
[0, 0, 0, 1, 1, 1],
[0, 0, 0, 1, 1, 1]],
[[0, 0, 1, 1, 1, 1],
[0, 0, 1, 1, 1, 1],
[0, 0, 1, 1, 1, 1],
[0, 0, 1, 1, 1, 1],
[0, 0, 1, 1, 1, 1],
[0, 0, 1, 1, 1, 1]]],)
"""
if
length_dim
==
0
:
raise
ValueError
(
"length_dim cannot be 0: {}"
.
format
(
length_dim
))
bs
=
paddle
.
shape
(
lengths
)[
0
]
maxlen
=
lengths
.
max
()
if
xs
is
None
:
maxlen
=
lengths
.
max
()
else
:
maxlen
=
paddle
.
shape
(
xs
)[
length_dim
]
seq_range
=
paddle
.
arange
(
0
,
maxlen
,
dtype
=
paddle
.
int64
)
seq_range_expand
=
seq_range
.
unsqueeze
(
0
).
expand
([
bs
,
maxlen
])
seq_length_expand
=
lengths
.
unsqueeze
(
-
1
)
mask
=
seq_range_expand
>=
seq_length_expand
return
mask
if
xs
is
not
None
:
assert
paddle
.
shape
(
xs
)[
0
]
==
bs
,
(
paddle
.
shape
(
xs
)[
0
],
bs
)
if
length_dim
<
0
:
length_dim
=
len
(
paddle
.
shape
(
xs
))
+
length_dim
# ind = (:, None, ..., None, :, , None, ..., None)
ind
=
tuple
(
slice
(
None
)
if
i
in
(
0
,
length_dim
)
else
None
for
i
in
range
(
len
(
paddle
.
shape
(
xs
))))
mask
=
paddle
.
expand
(
mask
[
ind
],
paddle
.
shape
(
xs
))
return
mask
def
make_non_pad_mask
(
lengths
,
length_dim
=-
1
):
def
make_non_pad_mask
(
lengths
,
xs
=
None
,
length_dim
=-
1
):
"""Make mask tensor containing indices of non-padded part.
Args:
...
...
@@ -90,16 +170,78 @@ def make_non_pad_mask(lengths, length_dim=-1):
Returns:
Tensor(bool): mask tensor containing indices of padded part bool.
Examples:
Examples:
With only lengths.
>>> lengths = [5, 3, 2]
>>> make_non_pad_mask(lengths)
masks = [[1, 1, 1, 1 ,1],
[1, 1, 1, 0, 0],
[1, 1, 0, 0, 0]]
[1, 1, 1, 0, 0],
[1, 1, 0, 0, 0]]
With the reference tensor.
>>> xs = paddle.zeros((3, 2, 4))
>>> make_non_pad_mask(lengths, xs)
tensor([[[1, 1, 1, 1],
[1, 1, 1, 1]],
[[1, 1, 1, 0],
[1, 1, 1, 0]],
[[1, 1, 0, 0],
[1, 1, 0, 0]]])
>>> xs = paddle.zeros((3, 2, 6))
>>> make_non_pad_mask(lengths, xs)
tensor([[[1, 1, 1, 1, 1, 0],
[1, 1, 1, 1, 1, 0]],
[[1, 1, 1, 0, 0, 0],
[1, 1, 1, 0, 0, 0]],
[[1, 1, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0]]])
With the reference tensor and dimension indicator.
>>> xs = paddle.zeros((3, 6, 6))
>>> make_non_pad_mask(lengths, xs, 1)
tensor([[[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[0, 0, 0, 0, 0, 0]],
[[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0]],
[[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0]]])
>>> make_non_pad_mask(lengths, xs, 2)
tensor([[[1, 1, 1, 1, 1, 0],
[1, 1, 1, 1, 1, 0],
[1, 1, 1, 1, 1, 0],
[1, 1, 1, 1, 1, 0],
[1, 1, 1, 1, 1, 0],
[1, 1, 1, 1, 1, 0]],
[[1, 1, 1, 0, 0, 0],
[1, 1, 1, 0, 0, 0],
[1, 1, 1, 0, 0, 0],
[1, 1, 1, 0, 0, 0],
[1, 1, 1, 0, 0, 0],
[1, 1, 1, 0, 0, 0]],
[[1, 1, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0]]])
"""
return
paddle
.
logical_not
(
make_pad_mask
(
lengths
,
length_dim
))
return
paddle
.
logical_not
(
make_pad_mask
(
lengths
,
xs
,
length_dim
))
def
initialize
(
model
:
nn
.
Layer
,
init
:
str
):
...
...
ernie-sat/run_clone_en_to_zh.sh
浏览文件 @
76b654cb
...
...
@@ -10,8 +10,8 @@ python inference.py \
--uid
=
Prompt_003_new
\
--new_str
=
'今天天气很好.'
\
--prefix
=
'./prompt/dev/'
\
--source_lang
uage
=
english
\
--target_lang
uage
=
chinese
\
--source_lang
=
english
\
--target_lang
=
chinese
\
--output_name
=
pred_clone.wav
\
--use_pt_vocoder
=
False
\
--voc
=
pwgan_aishell3
\
...
...
ernie-sat/run_gen_en.sh
浏览文件 @
76b654cb
...
...
@@ -9,8 +9,8 @@ python inference.py \
--uid
=
p299_096
\
--new_str
=
'I enjoy my life, do you?'
\
--prefix
=
'./prompt/dev/'
\
--source_lang
uage
=
english
\
--target_lang
uage
=
english
\
--source_lang
=
english
\
--target_lang
=
english
\
--output_name
=
pred_gen.wav
\
--use_pt_vocoder
=
False
\
--voc
=
pwgan_aishell3
\
...
...
ernie-sat/run_sedit_en.sh
浏览文件 @
76b654cb
...
...
@@ -10,8 +10,8 @@ python inference.py \
--uid
=
p243_new
\
--new_str
=
'for that reason cover is impossible to be given.'
\
--prefix
=
'./prompt/dev/'
\
--source_lang
uage
=
english
\
--target_lang
uage
=
english
\
--source_lang
=
english
\
--target_lang
=
english
\
--output_name
=
pred_edit.wav
\
--use_pt_vocoder
=
False
\
--voc
=
pwgan_aishell3
\
...
...
ernie-sat/sedit_arg_parser.py
浏览文件 @
76b654cb
...
...
@@ -80,10 +80,8 @@ def parse_args():
parser
.
add_argument
(
"--uid"
,
type
=
str
,
help
=
"uid"
)
parser
.
add_argument
(
"--new_str"
,
type
=
str
,
help
=
"new string"
)
parser
.
add_argument
(
"--prefix"
,
type
=
str
,
help
=
"prefix"
)
parser
.
add_argument
(
"--clone_prefix"
,
type
=
str
,
default
=
None
,
help
=
"clone prefix"
)
parser
.
add_argument
(
"--clone_uid"
,
type
=
str
,
default
=
None
,
help
=
"clone uid"
)
parser
.
add_argument
(
"--source_language"
,
type
=
str
,
help
=
"source language"
)
parser
.
add_argument
(
"--target_language"
,
type
=
str
,
help
=
"target language"
)
parser
.
add_argument
(
"--source_lang"
,
type
=
str
,
default
=
"english"
,
help
=
"source language"
)
parser
.
add_argument
(
"--target_lang"
,
type
=
str
,
default
=
"english"
,
help
=
"target language"
)
parser
.
add_argument
(
"--output_name"
,
type
=
str
,
help
=
"output name"
)
parser
.
add_argument
(
"--task_name"
,
type
=
str
,
help
=
"task name"
)
parser
.
add_argument
(
...
...
ernie-sat/tools/
parallel_wavegan_pretrained_vocoder
.py
→
ernie-sat/tools/
torch_pwgan
.py
浏览文件 @
76b654cb
...
...
@@ -9,7 +9,7 @@ import torch
import
yaml
class
ParallelWaveGANPretrainedVocoder
(
torch
.
nn
.
Module
):
class
TorchPWGAN
(
torch
.
nn
.
Module
):
"""Wrapper class to load the vocoder trained with parallel_wavegan repo."""
def
__init__
(
...
...
ernie-sat/utils.py
浏览文件 @
76b654cb
import
os
from
typing
import
List
from
typing
import
Optional
import
numpy
as
np
import
paddle
import
yaml
...
...
@@ -5,11 +9,8 @@ from sedit_arg_parser import parse_args
from
yacs.config
import
CfgNode
from
paddlespeech.s2t.utils.dynamic_import
import
dynamic_import
from
paddlespeech.t2s.exps.syn_utils
import
get_frontend
from
paddlespeech.t2s.exps.syn_utils
import
get_voc_inference
from
paddlespeech.t2s.modules.normalizer
import
ZScore
from
tools.parallel_wavegan_pretrained_vocoder
import
ParallelWaveGANPretrainedVocoder
# new add
from
tools.torch_pwgan
import
TorchPWGAN
model_alias
=
{
# acoustic model
...
...
@@ -25,6 +26,10 @@ model_alias = {
"paddlespeech.t2s.models.tacotron2:Tacotron2"
,
"tacotron2_inference"
:
"paddlespeech.t2s.models.tacotron2:Tacotron2Inference"
,
"pwgan"
:
"paddlespeech.t2s.models.parallel_wavegan:PWGGenerator"
,
"pwgan_inference"
:
"paddlespeech.t2s.models.parallel_wavegan:PWGInference"
,
}
...
...
@@ -43,60 +48,65 @@ def build_vocoder_from_file(
# Build vocoder
if
str
(
vocoder_file
).
endswith
(
".pkl"
):
# If the extension is ".pkl", the model is trained with parallel_wavegan
vocoder
=
ParallelWaveGANPretrainedVocoder
(
vocoder_file
,
vocoder_config_file
)
vocoder
=
TorchPWGAN
(
vocoder_file
,
vocoder_config_file
)
return
vocoder
.
to
(
device
)
else
:
raise
ValueError
(
f
"
{
vocoder_file
}
is not supported format."
)
def get_voc_out(mel, target_lang: str="chinese"):
    """Turn a mel spectrogram into a waveform with the configured vocoder.

    Args:
        mel: mel spectrogram to vocode (array-like; converted to a paddle
            tensor internally).
        target_lang: must be "chinese" or "english"; any other value is
            rejected.

    Returns:
        The synthesized waveform as a squeezed numpy array.
    """
    # vocoder selection and paths come from the shared command-line args
    opts = parse_args()
    assert target_lang in ("chinese", "english"), "In get_voc_out function, target_lang is illegal..."
    # parse the vocoder hyper-parameters from its YAML config file
    with open(opts.voc_config) as cfg_file:
        voc_cfg = CfgNode(yaml.safe_load(cfg_file))
    synthesizer = get_voc_inference(
        voc=opts.voc,
        voc_config=voc_cfg,
        voc_ckpt=opts.voc_ckpt,
        voc_stat=opts.voc_stat)
    mel_tensor = paddle.to_tensor(mel)
    # pure inference: disable gradient tracking
    with paddle.no_grad():
        wav = synthesizer(mel_tensor)
    return np.squeeze(wav)
# dygraph
def
get_am_inference
(
args
,
am_config
):
with
open
(
args
.
phones_dict
,
"r"
)
as
f
:
def
get_am_inference
(
am
:
str
=
'fastspeech2_csmsc'
,
am_config
:
CfgNode
=
None
,
am_ckpt
:
Optional
[
os
.
PathLike
]
=
None
,
am_stat
:
Optional
[
os
.
PathLike
]
=
None
,
phones_dict
:
Optional
[
os
.
PathLike
]
=
None
,
tones_dict
:
Optional
[
os
.
PathLike
]
=
None
,
speaker_dict
:
Optional
[
os
.
PathLike
]
=
None
,
return_am
:
bool
=
False
):
with
open
(
phones_dict
,
"r"
)
as
f
:
phn_id
=
[
line
.
strip
().
split
()
for
line
in
f
.
readlines
()]
vocab_size
=
len
(
phn_id
)
#
print("vocab_size:", vocab_size)
print
(
"vocab_size:"
,
vocab_size
)
tone_size
=
None
if
'tones_dict'
in
args
and
args
.
tones_dict
:
with
open
(
args
.
tones_dict
,
"r"
)
as
f
:
if
tones_dict
is
not
None
:
with
open
(
tones_dict
,
"r"
)
as
f
:
tone_id
=
[
line
.
strip
().
split
()
for
line
in
f
.
readlines
()]
tone_size
=
len
(
tone_id
)
print
(
"tone_size:"
,
tone_size
)
spk_num
=
None
if
'speaker_dict'
in
args
and
args
.
speaker_dict
:
with
open
(
args
.
speaker_dict
,
'rt'
)
as
f
:
if
speaker_dict
is
not
None
:
with
open
(
speaker_dict
,
'rt'
)
as
f
:
spk_id
=
[
line
.
strip
().
split
()
for
line
in
f
.
readlines
()]
spk_num
=
len
(
spk_id
)
print
(
"spk_num:"
,
spk_num
)
odim
=
am_config
.
n_mels
# model: {model_name}_{dataset}
am_name
=
a
rgs
.
am
[:
args
.
am
.
rindex
(
'_'
)]
am_dataset
=
a
rgs
.
am
[
args
.
am
.
rindex
(
'_'
)
+
1
:]
am_name
=
a
m
[:
am
.
rindex
(
'_'
)]
am_dataset
=
a
m
[
am
.
rindex
(
'_'
)
+
1
:]
am_class
=
dynamic_import
(
am_name
,
model_alias
)
am_inference_class
=
dynamic_import
(
am_name
+
'_inference'
,
model_alias
)
...
...
@@ -113,39 +123,61 @@ def get_am_inference(args, am_config):
elif
am_name
==
'tacotron2'
:
am
=
am_class
(
idim
=
vocab_size
,
odim
=
odim
,
**
am_config
[
"model"
])
am
.
set_state_dict
(
paddle
.
load
(
a
rgs
.
a
m_ckpt
)[
"main_params"
])
am
.
set_state_dict
(
paddle
.
load
(
am_ckpt
)[
"main_params"
])
am
.
eval
()
am_mu
,
am_std
=
np
.
load
(
a
rgs
.
a
m_stat
)
am_mu
,
am_std
=
np
.
load
(
am_stat
)
am_mu
=
paddle
.
to_tensor
(
am_mu
)
am_std
=
paddle
.
to_tensor
(
am_std
)
am_normalizer
=
ZScore
(
am_mu
,
am_std
)
am_inference
=
am_inference_class
(
am_normalizer
,
am
)
am_inference
.
eval
()
print
(
"acoustic model done!"
)
return
am
,
am_inference
,
am_name
,
am_dataset
,
phn_id
if
return_am
:
return
am_inference
,
am
else
:
return
am_inference
def
evaluate_durations
(
phns
,
target_language
=
"chinese"
,
fs
=
24000
,
hop_length
=
300
):
def get_voc_inference(
        voc: str='pwgan_csmsc',
        voc_config: Optional[os.PathLike]=None,
        voc_ckpt: Optional[os.PathLike]=None,
        voc_stat: Optional[os.PathLike]=None, ):
    """Build a vocoder inference network from config, checkpoint and stats.

    Args:
        voc: vocoder identifier of the form ``{model_name}_{dataset}``.
        voc_config: parsed config holding the generator hyper-parameters.
        voc_ckpt: checkpoint file with the trained generator weights.
        voc_stat: npy file with (mu, std) used to de-normalize input mels.

    Returns:
        The vocoder inference module, set to eval mode.
    """
    # strip the trailing "_{dataset}" suffix to obtain the model name
    model_name = voc[:voc.rindex('_')]
    generator_cls = dynamic_import(model_name, model_alias)
    inference_cls = dynamic_import(model_name + '_inference', model_alias)
    if model_name == 'wavernn':
        # wavernn checkpoints store their weights under "main_params"
        generator = generator_cls(**voc_config["model"])
        generator.set_state_dict(paddle.load(voc_ckpt)["main_params"])
        generator.eval()
    else:
        generator = generator_cls(**voc_config["generator_params"])
        generator.set_state_dict(paddle.load(voc_ckpt)["generator_params"])
        # weight norm is only needed for training; fold it away for inference
        generator.remove_weight_norm()
        generator.eval()
    mu, std = np.load(voc_stat)
    normalizer = ZScore(paddle.to_tensor(mu), paddle.to_tensor(std))
    inference = inference_cls(normalizer, generator)
    inference.eval()
    print("voc done!")
    return inference
def
evaluate_durations
(
phns
:
List
[
str
],
target_lang
:
str
=
"chinese"
,
fs
:
int
=
24000
,
hop_length
:
int
=
300
):
args
=
parse_args
()
if
target_lang
uage
==
'english'
:
if
target_lang
==
'english'
:
args
.
lang
=
'en'
args
.
am
=
"fastspeech2_ljspeech"
args
.
am_config
=
"download/fastspeech2_nosil_ljspeech_ckpt_0.5/default.yaml"
args
.
am_ckpt
=
"download/fastspeech2_nosil_ljspeech_ckpt_0.5/snapshot_iter_100000.pdz"
args
.
am_stat
=
"download/fastspeech2_nosil_ljspeech_ckpt_0.5/speech_stats.npy"
args
.
phones_dict
=
"download/fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt"
elif
target_lang
uage
==
'chinese'
:
elif
target_lang
==
'chinese'
:
args
.
lang
=
'zh'
args
.
am
=
"fastspeech2_csmsc"
args
.
am_config
=
"download/fastspeech2_conformer_baker_ckpt_0.5/conformer.yaml"
args
.
am_ckpt
=
"download/fastspeech2_conformer_baker_ckpt_0.5/snapshot_iter_76000.pdz"
args
.
am_stat
=
"download/fastspeech2_conformer_baker_ckpt_0.5/speech_stats.npy"
args
.
phones_dict
=
"download/fastspeech2_conformer_baker_ckpt_0.5/phone_id_map.txt"
# args = parser.parse_args(args=[])
if
args
.
ngpu
==
0
:
...
...
@@ -155,23 +187,28 @@ def evaluate_durations(phns,
else
:
print
(
"ngpu should >= 0 !"
)
assert
target_lang
uage
==
"chinese"
or
target_language
==
"english"
,
"In evaluate_durations function, target_language
is illegal..."
assert
target_lang
==
"chinese"
or
target_lang
==
"english"
,
"In evaluate_durations function, target_lang
is illegal..."
# Init body.
with
open
(
args
.
am_config
)
as
f
:
am_config
=
CfgNode
(
yaml
.
safe_load
(
f
))
# print("========Config========")
# print(am_config)
# print("---------------------")
# acoustic model
am
,
am_inference
,
am_name
,
am_dataset
,
phn_id
=
get_am_inference
(
args
,
am_config
)
am_inference
,
am
=
get_am_inference
(
am
=
args
.
am
,
am_config
=
am_config
,
am_ckpt
=
args
.
am_ckpt
,
am_stat
=
args
.
am_stat
,
phones_dict
=
args
.
phones_dict
,
tones_dict
=
args
.
tones_dict
,
speaker_dict
=
args
.
speaker_dict
,
return_am
=
True
)
torch_phns
=
phns
vocab_phones
=
{}
with
open
(
args
.
phones_dict
,
"r"
)
as
f
:
phn_id
=
[
line
.
strip
().
split
()
for
line
in
f
.
readlines
()]
for
tone
,
id
in
phn_id
:
vocab_phones
[
tone
]
=
int
(
id
)
# print("vocab_phones: ", len(vocab_phones))
vocab_size
=
len
(
vocab_phones
)
phonemes
=
[
phn
if
phn
in
vocab_phones
else
"sp"
for
phn
in
torch_phns
]
...
...
@@ -185,59 +222,3 @@ def evaluate_durations(phns,
phoneme_durations_new
=
pre_d_outs
*
hop_length
/
fs
phoneme_durations_new
=
phoneme_durations_new
.
tolist
()[:
-
1
]
return
phoneme_durations_new
def sentence2phns(sentence, target_language="en"):
    """Convert a sentence to phonemes and phone ids with the TTS frontend.

    Args:
        sentence (str): input text.
        target_language (str): 'en' for English or 'zh' for Chinese.

    Returns:
        tuple: (phoneme list of the first sentence, phone ids of the first
        sentence).

    Raises:
        ValueError: if ``target_language`` is neither 'zh' nor 'en'.
    """
    args = parse_args()
    if target_language == 'en':
        args.lang = 'en'
        args.phones_dict = "download/fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt"
    elif target_language == 'zh':
        args.lang = 'zh'
        args.phones_dict = "download/fastspeech2_conformer_baker_ckpt_0.5/phone_id_map.txt"
    else:
        # Fail fast: previously an invalid language was only printed, the
        # function then fell through to a trailing else and implicitly
        # returned None, crashing callers that unpack the returned pair.
        raise ValueError("target_language should be in {'zh', 'en'}!")
    frontend = get_frontend(args)
    merge_sentences = True
    get_tone_ids = False
    if target_language == 'zh':
        input_ids = frontend.get_input_ids(
            sentence,
            merge_sentences=merge_sentences,
            get_tone_ids=get_tone_ids,
            print_info=False)
        phonemes = frontend.get_phonemes(
            sentence, merge_sentences=merge_sentences, print_info=False)
        return phonemes[0], input_ids["phone_ids"][0]
    else:
        # target_language == 'en', validated above
        phonemes = frontend.phoneticize(sentence)
        input_ids = frontend.get_input_ids(
            sentence, merge_sentences=merge_sentences)
        vocab_phones = {}
        # punctuation (full- and half-width) is mapped to "sp" below
        punc = ":,;。?!“”‘’':,;.?!"
        with open(args.phones_dict, 'rt') as f:
            phn_id = [line.strip().split() for line in f.readlines()]
        for phn, phn_idx in phn_id:
            vocab_phones[phn] = int(phn_idx)
        # drop the leading/trailing boundary markers added by phoneticize,
        # and any whitespace tokens
        phones = [phn for phn in phonemes[1:-1] if not phn.isspace()]
        # replace unknown phones and punctuation with "sp" (silence/pause)
        phones = [
            phn if (phn in vocab_phones and phn not in punc) else "sp"
            for phn in phones
        ]
        return phones, input_ids["phone_ids"][0]
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录