Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
42f2186d
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
9 个月 前同步成功
通知
200
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
42f2186d
编写于
6月 07, 2023
作者:
H
Hui Zhang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
more comment on tts frontend
上级
8aa9790c
变更
9
展开全部
隐藏空白更改
内联
并排
Showing
9 changed files
with
409 additions
and
200 deletions
+409
-200
paddlespeech/t2s/assets/__init__.py
paddlespeech/t2s/assets/__init__.py
+0
-0
paddlespeech/t2s/exps/syn_utils.py
paddlespeech/t2s/exps/syn_utils.py
+20
-13
paddlespeech/t2s/exps/synthesize_e2e.py
paddlespeech/t2s/exps/synthesize_e2e.py
+26
-0
paddlespeech/t2s/frontend/arpabet.py
paddlespeech/t2s/frontend/arpabet.py
+75
-45
paddlespeech/t2s/frontend/polyphonic.yaml
paddlespeech/t2s/frontend/polyphonic.yaml
+2
-1
paddlespeech/t2s/frontend/ssml/__init__.py
paddlespeech/t2s/frontend/ssml/__init__.py
+0
-0
paddlespeech/t2s/frontend/ssml/xml_processor.py
paddlespeech/t2s/frontend/ssml/xml_processor.py
+2
-1
paddlespeech/t2s/frontend/tone_sandhi.py
paddlespeech/t2s/frontend/tone_sandhi.py
+26
-16
paddlespeech/t2s/frontend/zh_frontend.py
paddlespeech/t2s/frontend/zh_frontend.py
+258
-124
未找到文件。
paddlespeech/t2s/assets/__init__.py
0 → 100644
浏览文件 @
42f2186d
paddlespeech/t2s/exps/syn_utils.py
浏览文件 @
42f2186d
...
...
@@ -99,14 +99,23 @@ def norm(data, mean, std):
return
(
data
-
mean
)
/
std
def
get_chunks
(
data
,
block_size
:
int
,
pad_size
:
int
):
data_len
=
data
.
shape
[
1
]
def
get_chunks
(
mel
,
chunk_size
:
int
,
pad_size
:
int
):
"""
Split mel by chunk size with left and right context.
Args:
mel (paddle.Tensor): mel spectrogram, shape (B, T, D)
chunk_size (int): chunk size
pad_size (int): size for left and right context.
"""
T
=
mel
.
shape
[
1
]
n
=
math
.
ceil
(
T
/
chunk_size
)
chunks
=
[]
n
=
math
.
ceil
(
data_len
/
block_size
)
for
i
in
range
(
n
):
start
=
max
(
0
,
i
*
bloc
k_size
-
pad_size
)
end
=
min
((
i
+
1
)
*
block_size
+
pad_size
,
data_len
)
chunks
.
append
(
data
[:,
start
:
end
,
:])
start
=
max
(
0
,
i
*
chun
k_size
-
pad_size
)
end
=
min
((
i
+
1
)
*
chunk_size
+
pad_size
,
T
)
chunks
.
append
(
mel
[:,
start
:
end
,
:])
return
chunks
...
...
@@ -117,14 +126,10 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
with
open
(
text_file
,
'rt'
,
encoding
=
'utf-8'
)
as
f
:
for
line
in
f
:
if
line
.
strip
()
!=
""
:
items
=
re
.
split
(
r
"\s+"
,
line
.
strip
(),
1
)
items
=
re
.
split
(
r
"\s+"
,
line
.
strip
(),
maxsplit
=
1
)
assert
len
(
items
)
==
2
utt_id
=
items
[
0
]
if
lang
in
{
'zh'
,
'canton'
}:
sentence
=
""
.
join
(
items
[
1
:])
elif
lang
==
'en'
:
sentence
=
" "
.
join
(
items
[
1
:])
elif
lang
==
'mix'
:
sentence
=
" "
.
join
(
items
[
1
:])
sentence
=
items
[
1
]
sentences
.
append
((
utt_id
,
sentence
))
return
sentences
...
...
@@ -319,6 +324,7 @@ def run_frontend(
input_ids
=
{}
if
text
.
strip
()
!=
""
and
re
.
match
(
r
".*?<speak>.*?</speak>.*"
,
text
,
re
.
DOTALL
):
# using ssml
input_ids
=
frontend
.
get_input_ids_ssml
(
text
,
merge_sentences
=
merge_sentences
,
...
...
@@ -359,6 +365,7 @@ def run_frontend(
outs
.
update
({
'is_slurs'
:
is_slurs
})
else
:
print
(
"lang should in {'zh', 'en', 'mix', 'canton', 'sing'}!"
)
outs
.
update
({
'phone_ids'
:
phone_ids
})
return
outs
...
...
paddlespeech/t2s/exps/synthesize_e2e.py
浏览文件 @
42f2186d
...
...
@@ -13,6 +13,7 @@
# limitations under the License.
import
argparse
from
pathlib
import
Path
from
pprint
import
pprint
import
paddle
import
soundfile
as
sf
...
...
@@ -78,6 +79,7 @@ def evaluate(args):
# whether dygraph to static
if
args
.
inference_dir
:
print
(
"convert am and voc to static model."
)
# acoustic model
am_inference
=
am_to_static
(
am_inference
=
am_inference
,
...
...
@@ -92,6 +94,7 @@ def evaluate(args):
output_dir
=
Path
(
args
.
output_dir
)
output_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
merge_sentences
=
False
# Avoid failing to stop at the end of a sub-sentence when tacotron2_ljspeech is converted from dygraph to static graph,
# although it still does not stop at the very end (NOTE by yuantian01 Feb 9 2022)
...
...
@@ -102,12 +105,18 @@ def evaluate(args):
if
am_name
==
'speedyspeech'
:
get_tone_ids
=
True
# wav samples
N
=
0
# inference time cost
T
=
0
# [(uid, text), ]
if
am_name
==
'diffsinger'
:
sentences
=
get_sentences_svs
(
text_file
=
args
.
text
)
else
:
sentences
=
get_sentences
(
text_file
=
args
.
text
,
lang
=
args
.
lang
)
pprint
(
f
"inputs:
{
sentences
}
"
)
for
utt_id
,
sentence
in
sentences
:
with
timer
()
as
t
:
if
am_name
==
"diffsinger"
:
...
...
@@ -116,6 +125,8 @@ def evaluate(args):
else
:
text
=
sentence
svs_input
=
None
# frontend
frontend_dict
=
run_frontend
(
frontend
=
frontend
,
text
=
text
,
...
...
@@ -124,25 +135,33 @@ def evaluate(args):
lang
=
args
.
lang
,
svs_input
=
svs_input
)
phone_ids
=
frontend_dict
[
'phone_ids'
]
# pprint(f"process: {utt_id} {phone_ids}")
with
paddle
.
no_grad
():
flags
=
0
for
i
in
range
(
len
(
phone_ids
)):
# sub phone, split by `sp` or punctuation.
part_phone_ids
=
phone_ids
[
i
]
# acoustic model
if
am_name
==
'fastspeech2'
:
# multi speaker
if
am_dataset
in
{
"aishell3"
,
"vctk"
,
"mix"
,
"canton"
}:
# multi-speaker
spk_id
=
paddle
.
to_tensor
(
args
.
spk_id
)
mel
=
am_inference
(
part_phone_ids
,
spk_id
)
else
:
# single-speaker
mel
=
am_inference
(
part_phone_ids
)
elif
am_name
==
'speedyspeech'
:
part_tone_ids
=
frontend_dict
[
'tone_ids'
][
i
]
if
am_dataset
in
{
"aishell3"
,
"vctk"
,
"mix"
}:
# multi-speaker
spk_id
=
paddle
.
to_tensor
(
args
.
spk_id
)
mel
=
am_inference
(
part_phone_ids
,
part_tone_ids
,
spk_id
)
else
:
# single-speaker
mel
=
am_inference
(
part_phone_ids
,
part_tone_ids
)
elif
am_name
==
'tacotron2'
:
mel
=
am_inference
(
part_phone_ids
)
...
...
@@ -155,6 +174,7 @@ def evaluate(args):
note
=
part_note_ids
,
note_dur
=
part_note_durs
,
is_slur
=
part_is_slurs
,
)
# vocoder
wav
=
voc_inference
(
mel
)
if
flags
==
0
:
...
...
@@ -162,17 +182,23 @@ def evaluate(args):
flags
=
1
else
:
wav_all
=
paddle
.
concat
([
wav_all
,
wav
])
wav
=
wav_all
.
numpy
()
N
+=
wav
.
size
T
+=
t
.
elapse
# samples per second
speed
=
wav
.
size
/
t
.
elapse
# generating one second of wav takes `RTF` seconds
rtf
=
am_config
.
fs
/
speed
print
(
f
"
{
utt_id
}
, mel:
{
mel
.
shape
}
, wave:
{
wav
.
shape
}
, time:
{
t
.
elapse
}
s, Hz:
{
speed
}
, RTF:
{
rtf
}
."
)
sf
.
write
(
str
(
output_dir
/
(
utt_id
+
".wav"
)),
wav
,
samplerate
=
am_config
.
fs
)
print
(
f
"
{
utt_id
}
done!"
)
print
(
f
"generation speed:
{
N
/
T
}
Hz, RTF:
{
am_config
.
fs
/
(
N
/
T
)
}
"
)
...
...
paddlespeech/t2s/frontend/arpabet.py
浏览文件 @
42f2186d
...
...
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
paddlespeech.t2s.frontend.phonectic
import
Phonetics
"""
A phonology system with ARPABET symbols and limited punctuations. The G2P
conversion is done by g2p_en.
...
...
@@ -19,55 +18,68 @@ conversion is done by g2p_en.
Note that g2p_en does not handle words with hyphens well. So make sure the input
sentence is first normalized.
"""
from
paddlespeech.t2s.frontend.vocab
import
Vocab
from
g2p_en
import
G2p
from
paddlespeech.t2s.frontend.phonectic
import
Phonetics
from
paddlespeech.t2s.frontend.vocab
import
Vocab
class
ARPABET
(
Phonetics
):
"""A phonology for English that uses ARPABET as the phoneme vocabulary.
"""A phonology for English that uses ARPABET without stress as the phoneme vocabulary.
47 symbols = 39 phones + 4 punctuations + 4 special tokens(<pad> <unk> <s> </s>)
The current phoneme set contains 39 phonemes, vowels carry a lexical stress marker:
0 — No stress
1 — Primary stress
2 — Secondary stress
Phoneme Set:
Phoneme Example Translation
------- ------- -----------
AA odd AA D
AE at AE T
AH hut HH AH T
AO ought AO T
AW cow K AW
AY hide HH AY D
B be B IY
CH cheese CH IY Z
D dee D IY
DH thee DH IY
EH Ed EH D
ER hurt HH ER T
EY ate EY T
F fee F IY
G green G R IY N
HH he HH IY
IH it IH T
IY eat IY T
JH gee JH IY
K key K IY
L lee L IY
M me M IY
N knee N IY
NG ping P IH NG
OW oat OW T
OY toy T OY
P pee P IY
R read R IY D
S sea S IY
SH she SH IY
T tea T IY
TH theta TH EY T AH
UH hood HH UH D
UW two T UW
V vee V IY
W we W IY
Y yield Y IY L D
Z zee Z IY
ZH seizure S IY ZH ER
See http://www.speech.cs.cmu.edu/cgi-bin/cmudict for more details.
Phoneme Example Translation
------- ------- -----------
AA odd AA D
AE at AE T
AH hut HH AH T
AO ought AO T
AW cow K AW
AY hide HH AY D
B be B IY
CH cheese CH IY Z
D dee D IY
DH thee DH IY
EH Ed EH D
ER hurt HH ER T
EY ate EY T
F fee F IY
G green G R IY N
HH he HH IY
IH it IH T
IY eat IY T
JH gee JH IY
K key K IY
L lee L IY
M me M IY
N knee N IY
NG ping P IH NG
OW oat OW T
OY toy T OY
P pee P IY
R read R IY D
S sea S IY
SH she SH IY
T tea T IY
TH theta TH EY T AH
UH hood HH UH D
UW two T UW
V vee V IY
W we W IY
Y yield Y IY L D
Z zee Z IY
ZH seizure S IY ZH ER
"""
# 39 phonemes
phonemes
=
[
'AA'
,
'AE'
,
'AH'
,
'AO'
,
'AW'
,
'AY'
,
'B'
,
'CH'
,
'D'
,
'DH'
,
'EH'
,
'ER'
,
'EY'
,
'F'
,
'G'
,
'HH'
,
'IH'
,
'IY'
,
'JH'
,
'K'
,
'L'
,
'M'
,
'N'
,
'NG'
,
'OW'
,
...
...
@@ -76,6 +88,8 @@ class ARPABET(Phonetics):
]
punctuations
=
[
','
,
'.'
,
'?'
,
'!'
]
symbols
=
phonemes
+
punctuations
# vowels carry a lexical stress marker:
# 0 unstressed (无重音), 1 primary stress (主重音), and 2 secondary stress (次重音)
_stress_to_no_stress_
=
{
'AA0'
:
'AA'
,
'AA1'
:
'AA'
,
...
...
@@ -124,7 +138,12 @@ class ARPABET(Phonetics):
'UW2'
:
'UW'
}
def
__repr__
(
self
):
fmt
=
"ARPABETWithoutStress(phonemes: {}, punctuations: {})"
return
fmt
.
format
(
len
(
phonemes
),
punctuations
)
def
__init__
(
self
):
# https://github.com/Kyubyong/g2p/blob/master/g2p_en/g2p.py
self
.
backend
=
G2p
()
self
.
vocab
=
Vocab
(
self
.
phonemes
+
self
.
punctuations
)
...
...
@@ -139,6 +158,7 @@ class ARPABET(Phonetics):
Returns:
List[str]: The list of pronunciation sequence.
"""
# g2p and remove vowel stress
phonemes
=
[
self
.
_remove_vowels
(
item
)
for
item
in
self
.
backend
(
sentence
)
]
...
...
@@ -158,6 +178,7 @@ class ARPABET(Phonetics):
Returns:
List[int]: The list of pronunciation id sequence.
"""
# phonemes to ids
ids
=
[
self
.
vocab
.
lookup
(
item
)
for
item
in
phonemes
]
return
ids
...
...
@@ -189,11 +210,16 @@ class ARPABET(Phonetics):
def
vocab_size
(
self
):
""" Vocab size.
"""
# 47 = 39 phones + 4 punctuations + 4 special tokens
# 47 = 39 phones + 4 punctuations + 4 special tokens
(<pad> <unk> <s> </s>)
return
len
(
self
.
vocab
)
class
ARPABETWithStress
(
Phonetics
):
"""
A phonology for English that uses ARPABET with stress as the phoneme vocabulary.
77 symbols = 69 phones + 4 punctuations + 4 special tokens
"""
phonemes
=
[
'AA0'
,
'AA1'
,
'AA2'
,
'AE0'
,
'AE1'
,
'AE2'
,
'AH0'
,
'AH1'
,
'AH2'
,
'AO0'
,
'AO1'
,
'AO2'
,
'AW0'
,
'AW1'
,
'AW2'
,
'AY0'
,
'AY1'
,
'AY2'
,
'B'
,
'CH'
,
'D'
,
...
...
@@ -206,6 +232,10 @@ class ARPABETWithStress(Phonetics):
punctuations
=
[
','
,
'.'
,
'?'
,
'!'
]
symbols
=
phonemes
+
punctuations
def
__repr__
(
self
):
fmt
=
"ARPABETWithStress(phonemes: {}, punctuations: {})"
return
fmt
.
format
(
len
(
phonemes
),
punctuations
)
def
__init__
(
self
):
self
.
backend
=
G2p
()
self
.
vocab
=
Vocab
(
self
.
phonemes
+
self
.
punctuations
)
...
...
paddlespeech/t2s/frontend/polyphonic.yaml
浏览文件 @
42f2186d
...
...
@@ -47,4 +47,5 @@ polyphonic:
恶行
:
[
'
e4'
,
'
xing2'
]
唉
:
[
'
ai4'
]
扎实
:
[
'
zha1'
,
'
shi2'
]
干将
:
[
'
gan4'
,
'
jiang4'
]
\ No newline at end of file
干将
:
[
'
gan4'
,
'
jiang4'
]
陈威行
:
[
'
chen2'
,
'
wei1'
,
'
hang2'
]
\ No newline at end of file
paddlespeech/t2s/ssml/__init__.py
→
paddlespeech/t2s/
frontend/
ssml/__init__.py
浏览文件 @
42f2186d
文件已移动
paddlespeech/t2s/ssml/xml_processor.py
→
paddlespeech/t2s/
frontend/
ssml/xml_processor.py
浏览文件 @
42f2186d
...
...
@@ -90,13 +90,14 @@ class MixTextProcessor():
dom
=
DomXml
(
in_xml
)
tags
=
dom
.
get_text_and_sayas_tags
()
ctlist
.
extend
(
tags
)
ctlist
.
append
(
after_xml
)
return
ctlist
else
:
ctlist
.
append
(
mixstr
)
return
ctlist
class
DomXml
():
def
__init__
(
self
,
xmlstr
):
self
.
tdom
=
parseString
(
xmlstr
)
#Document
...
...
paddlespeech/t2s/frontend/tone_sandhi.py
浏览文件 @
42f2186d
...
...
@@ -20,6 +20,9 @@ from pypinyin import Style
class
ToneSandhi
():
def
__repr__
(
self
):
return
"MandarinToneSandhi"
def
__init__
(
self
):
self
.
must_neural_tone_words
=
{
'麻烦'
,
'麻利'
,
'鸳鸯'
,
'高粱'
,
'骨头'
,
'骆驼'
,
'马虎'
,
'首饰'
,
'馒头'
,
'馄饨'
,
'风筝'
,
...
...
@@ -69,6 +72,19 @@ class ToneSandhi():
}
self
.
punc
=
":,;。?!“”‘’':,;.?!"
def
_split_word
(
self
,
word
:
str
)
->
List
[
str
]:
word_list
=
jieba
.
cut_for_search
(
word
)
word_list
=
sorted
(
word_list
,
key
=
lambda
i
:
len
(
i
),
reverse
=
False
)
first_subword
=
word_list
[
0
]
first_begin_idx
=
word
.
find
(
first_subword
)
if
first_begin_idx
==
0
:
second_subword
=
word
[
len
(
first_subword
):]
new_word_list
=
[
first_subword
,
second_subword
]
else
:
second_subword
=
word
[:
-
len
(
first_subword
)]
new_word_list
=
[
second_subword
,
first_subword
]
return
new_word_list
# the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
# e.g.
# word: "家里"
...
...
@@ -154,18 +170,8 @@ class ToneSandhi():
finals
[
i
]
=
finals
[
i
][:
-
1
]
+
"4"
return
finals
def
_split_word
(
self
,
word
:
str
)
->
List
[
str
]:
word_list
=
jieba
.
cut_for_search
(
word
)
word_list
=
sorted
(
word_list
,
key
=
lambda
i
:
len
(
i
),
reverse
=
False
)
first_subword
=
word_list
[
0
]
first_begin_idx
=
word
.
find
(
first_subword
)
if
first_begin_idx
==
0
:
second_subword
=
word
[
len
(
first_subword
):]
new_word_list
=
[
first_subword
,
second_subword
]
else
:
second_subword
=
word
[:
-
len
(
first_subword
)]
new_word_list
=
[
second_subword
,
first_subword
]
return
new_word_list
def
_all_tone_three
(
self
,
finals
:
List
[
str
])
->
bool
:
return
all
(
x
[
-
1
]
==
"3"
for
x
in
finals
)
def
_three_sandhi
(
self
,
word
:
str
,
finals
:
List
[
str
])
->
List
[
str
]:
...
...
@@ -207,9 +213,6 @@ class ToneSandhi():
return
finals
def
_all_tone_three
(
self
,
finals
:
List
[
str
])
->
bool
:
return
all
(
x
[
-
1
]
==
"3"
for
x
in
finals
)
# merge "不" and the word behind it
# if not merged, "不" sometimes appears alone in jieba's segmentation, which may cause sandhi errors
def
_merge_bu
(
self
,
seg
:
List
[
Tuple
[
str
,
str
]])
->
List
[
Tuple
[
str
,
str
]]:
...
...
@@ -336,6 +339,9 @@ class ToneSandhi():
def
pre_merge_for_modify
(
self
,
seg
:
List
[
Tuple
[
str
,
str
]])
->
List
[
Tuple
[
str
,
str
]]:
"""
seg: [(word, pos), ...]
"""
seg
=
self
.
_merge_bu
(
seg
)
seg
=
self
.
_merge_yi
(
seg
)
seg
=
self
.
_merge_reduplication
(
seg
)
...
...
@@ -346,7 +352,11 @@ class ToneSandhi():
def
modified_tone
(
self
,
word
:
str
,
pos
:
str
,
finals
:
List
[
str
])
->
List
[
str
]:
"""
word: the segmented word
pos: part-of-speech tag
finals: finals with tone, [final1, ..., finaln]
"""
finals
=
self
.
_bu_sandhi
(
word
,
finals
)
finals
=
self
.
_yi_sandhi
(
word
,
finals
)
finals
=
self
.
_neural_sandhi
(
word
,
pos
,
finals
)
...
...
paddlespeech/t2s/frontend/zh_frontend.py
浏览文件 @
42f2186d
此差异已折叠。
点击以展开。
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录