PaddlePaddle / DeepSpeech · commit 42f2186d
Commit 42f2186d
Authored Jun 07, 2023 by Hui Zhang
Parent: 8aa9790c

more comment on tts frontend
Showing 9 changed files with 409 additions and 200 deletions (+409 −200).
paddlespeech/t2s/assets/__init__.py                +0    -0
paddlespeech/t2s/exps/syn_utils.py                 +20   -13
paddlespeech/t2s/exps/synthesize_e2e.py            +26   -0
paddlespeech/t2s/frontend/arpabet.py               +75   -45
paddlespeech/t2s/frontend/polyphonic.yaml          +2    -1
paddlespeech/t2s/frontend/ssml/__init__.py         +0    -0
paddlespeech/t2s/frontend/ssml/xml_processor.py    +2    -1
paddlespeech/t2s/frontend/tone_sandhi.py           +26   -16
paddlespeech/t2s/frontend/zh_frontend.py           +258  -124
paddlespeech/t2s/assets/__init__.py — new file (mode 0 → 100644), empty.
paddlespeech/t2s/exps/syn_utils.py

@@ -99,14 +99,23 @@ def norm(data, mean, std):
     return (data - mean) / std


-def get_chunks(data, block_size: int, pad_size: int):
-    data_len = data.shape[1]
-    chunks = []
-    n = math.ceil(data_len / block_size)
+def get_chunks(mel, chunk_size: int, pad_size: int):
+    """
+    Split mel by chunk size with left and right context.
+
+    Args:
+        mel (paddle.Tensor): mel spectrogram, shape (B, T, D)
+        chunk_size (int): chunk size
+        pad_size (int): size for left and right context.
+    """
+    T = mel.shape[1]
+    n = math.ceil(T / chunk_size)
+    chunks = []
     for i in range(n):
-        start = max(0, i * block_size - pad_size)
-        end = min((i + 1) * block_size + pad_size, data_len)
-        chunks.append(data[:, start:end, :])
+        start = max(0, i * chunk_size - pad_size)
+        end = min((i + 1) * chunk_size + pad_size, T)
+        chunks.append(mel[:, start:end, :])
     return chunks
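For context, the renamed helper's chunk/pad arithmetic can be checked in isolation. A minimal sketch with numpy standing in for paddle tensors (the shapes and numbers below are illustrative, not from the repo):

import math

import numpy as np


def get_chunks(mel, chunk_size: int, pad_size: int):
    # same arithmetic as the new syn_utils.get_chunks
    T = mel.shape[1]
    n = math.ceil(T / chunk_size)
    chunks = []
    for i in range(n):
        start = max(0, i * chunk_size - pad_size)
        end = min((i + 1) * chunk_size + pad_size, T)
        chunks.append(mel[:, start:end, :])
    return chunks


mel = np.zeros((1, 250, 80))  # hypothetical: batch 1, 250 frames, 80 mel bins
print([c.shape[1] for c in get_chunks(mel, chunk_size=100, pad_size=10)])
# [110, 120, 60]: interior chunks gain up to pad_size of context on each side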
@@ -117,14 +126,10 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
     with open(text_file, 'rt', encoding='utf-8') as f:
         for line in f:
             if line.strip() != "":
-                items = re.split(r"\s+", line.strip(), 1)
+                items = re.split(r"\s+", line.strip(), maxsplit=1)
+                assert len(items) == 2
                 utt_id = items[0]
-                if lang in {'zh', 'canton'}:
-                    sentence = "".join(items[1:])
-                elif lang == 'en':
-                    sentence = " ".join(items[1:])
-                elif lang == 'mix':
-                    sentence = " ".join(items[1:])
+                sentence = items[1]
                 sentences.append((utt_id, sentence))
     return sentences
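The maxsplit change matters because sentences contain whitespace: splitting once keeps everything after the utterance id intact for every language, and the new assert rejects id-only lines. A quick check (the example line is made up):

import re

line = "utt_001 hello world , how are you"
items = re.split(r"\s+", line.strip(), maxsplit=1)
assert len(items) == 2
utt_id, sentence = items
print(utt_id)    # utt_001
print(sentence)  # hello world , how are you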
@@ -319,6 +324,7 @@ def run_frontend(
     input_ids = {}
     if text.strip() != "" and re.match(r".*?<speak>.*?</speak>.*", text,
                                        re.DOTALL):
+        # using ssml
         input_ids = frontend.get_input_ids_ssml(
             text,
             merge_sentences=merge_sentences,
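The regex that routes text to the SSML path matches any input containing a complete <speak>...</speak> element, with re.DOTALL so the element may span newlines. A small check (the sample strings are illustrative):

import re

pattern = r".*?<speak>.*?</speak>.*"
ssml = "<speak>你好<say-as pinyin='hang2'>行</say-as></speak>"
plain = "你好。"
print(bool(re.match(pattern, ssml, re.DOTALL)))   # True  -> get_input_ids_ssml
print(bool(re.match(pattern, plain, re.DOTALL)))  # False -> plain front end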
@@ -359,6 +365,7 @@ def run_frontend(
             outs.update({'is_slurs': is_slurs})
     else:
         print("lang should in {'zh', 'en', 'mix', 'canton', 'sing'}!")
     outs.update({'phone_ids': phone_ids})
     return outs
paddlespeech/t2s/exps/synthesize_e2e.py

@@ -13,6 +13,7 @@
 # limitations under the License.
 import argparse
 from pathlib import Path
+from pprint import pprint

 import paddle
 import soundfile as sf

@@ -78,6 +79,7 @@ def evaluate(args):
     # whether dygraph to static
     if args.inference_dir:
+        print("convert am and voc to static model.")
         # acoustic model
         am_inference = am_to_static(
             am_inference=am_inference,

@@ -92,6 +94,7 @@ def evaluate(args):
     output_dir = Path(args.output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
     merge_sentences = False
     # Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph
     # but still not stopping in the end (NOTE by yuantian01 Feb 9 2022)

@@ -102,12 +105,18 @@ def evaluate(args):
     if am_name == 'speedyspeech':
         get_tone_ids = True
+    # wav samples
     N = 0
+    # inference time cost
     T = 0
+    # [(uid, text), ]
     if am_name == 'diffsinger':
         sentences = get_sentences_svs(text_file=args.text)
     else:
         sentences = get_sentences(text_file=args.text, lang=args.lang)
+    pprint(f"inputs: {sentences}")
     for utt_id, sentence in sentences:
         with timer() as t:
             if am_name == "diffsinger":

@@ -116,6 +125,8 @@ def evaluate(args):
             else:
                 text = sentence
                 svs_input = None
+            # frontend
             frontend_dict = run_frontend(
                 frontend=frontend,
                 text=text,

@@ -124,25 +135,33 @@ def evaluate(args):
                 lang=args.lang,
                 svs_input=svs_input)
             phone_ids = frontend_dict['phone_ids']
+            # pprint(f"process: {utt_id} {phone_ids}")
             with paddle.no_grad():
                 flags = 0
                 for i in range(len(phone_ids)):
+                    # sub phone, split by `sp` or punctuation.
                     part_phone_ids = phone_ids[i]
                     # acoustic model
                     if am_name == 'fastspeech2':
                         # multi speaker
                         if am_dataset in {"aishell3", "vctk", "mix", "canton"}:
+                            # multi-speaker
                             spk_id = paddle.to_tensor(args.spk_id)
                             mel = am_inference(part_phone_ids, spk_id)
                         else:
+                            # single-speaker
                             mel = am_inference(part_phone_ids)
                     elif am_name == 'speedyspeech':
                         part_tone_ids = frontend_dict['tone_ids'][i]
                         if am_dataset in {"aishell3", "vctk", "mix"}:
+                            # multi-speaker
                             spk_id = paddle.to_tensor(args.spk_id)
                             mel = am_inference(part_phone_ids, part_tone_ids,
                                                spk_id)
                         else:
+                            # single-speaker
                             mel = am_inference(part_phone_ids, part_tone_ids)
                     elif am_name == 'tacotron2':
                         mel = am_inference(part_phone_ids)

@@ -155,6 +174,7 @@ def evaluate(args):
                         note=part_note_ids,
                         note_dur=part_note_durs,
                         is_slur=part_is_slurs, )
                 # vocoder
                 wav = voc_inference(mel)
                 if flags == 0:

@@ -162,17 +182,23 @@ def evaluate(args):
                     flags = 1
                 else:
                     wav_all = paddle.concat([wav_all, wav])
             wav = wav_all.numpy()
             N += wav.size
             T += t.elapse
+            # samples per second
             speed = wav.size / t.elapse
+            # generate one second wav need `RTF` seconds
             rtf = am_config.fs / speed
             print(
                 f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
             )
             sf.write(
                 str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs)
             print(f"{utt_id} done!")
     print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")
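The comments added here document the timing bookkeeping: N accumulates generated samples, T accumulates elapsed seconds, and RTF is how many seconds of compute one second of audio costs. The arithmetic, with made-up numbers:

fs = 24000        # am_config.fs, the output sample rate
N = 240000        # 10 s of audio generated ...
T = 2.5           # ... in 2.5 s of wall-clock time
speed = N / T     # 96000.0 samples generated per second
rtf = fs / speed  # 0.25: one second of audio takes 0.25 s to synthesize
print(speed, rtf)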
paddlespeech/t2s/frontend/arpabet.py

@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from paddlespeech.t2s.frontend.phonectic import Phonetics
 """
 A phonology system with ARPABET symbols and limited punctuations. The G2P
 conversion is done by g2p_en.

@@ -19,55 +18,68 @@ conversion is done by g2p_en.
 Note that g2p_en does not handle words with hypen well. So make sure the input
 sentence is first normalized.
 """
-from paddlespeech.t2s.frontend.vocab import Vocab
 from g2p_en import G2p

+from paddlespeech.t2s.frontend.phonectic import Phonetics
+from paddlespeech.t2s.frontend.vocab import Vocab


 class ARPABET(Phonetics):
-    """A phonology for English that uses ARPABET as the phoneme vocabulary.
-    47 symbols = 39 phones + 4 punctuations + 4 special tokens(<pad> <unk> <s> </s>)
+    """A phonology for English that uses ARPABET without stress as the phoneme vocabulary.
+    The current phoneme set contains 39 phonemes, vowels carry a lexical stress marker:
+        0 — No stress
+        1 — Primary stress
+        2 — Secondary stress
+
+    Phoneme Set:
+    Phoneme Example Translation
+    ------- ------- -----------
+    AA      odd     AA D
+    AE      at      AE T
+    AH      hut     HH AH T
+    AO      ought   AO T
+    AW      cow     K AW
+    AY      hide    HH AY D
+    B       be      B IY
+    CH      cheese  CH IY Z
+    D       dee     D IY
+    DH      thee    DH IY
+    EH      Ed      EH D
+    ER      hurt    HH ER T
+    EY      ate     EY T
+    F       fee     F IY
+    G       green   G R IY N
+    HH      he      HH IY
+    IH      it      IH T
+    IY      eat     IY T
+    JH      gee     JH IY
+    K       key     K IY
+    L       lee     L IY
+    M       me      M IY
+    N       knee    N IY
+    NG      ping    P IH NG
+    OW      oat     OW T
+    OY      toy     T OY
+    P       pee     P IY
+    R       read    R IY D
+    S       sea     S IY
+    SH      she     SH IY
+    T       tea     T IY
+    TH      theta   TH EY T AH
+    UH      hood    HH UH D
+    UW      two     T UW
+    V       vee     V IY
+    W       we      W IY
+    Y       yield   Y IY L D
+    Z       zee     Z IY
+    ZH      seizure S IY ZH ER
+
     See http://www.speech.cs.cmu.edu/cgi-bin/cmudict for more details.
-
-    Phoneme Example Translation
-    ------- ------- -----------
-    AA      odd     AA D
-    AE      at      AE T
-    AH      hut     HH AH T
-    AO      ought   AO T
-    AW      cow     K AW
-    AY      hide    HH AY D
-    B       be      B IY
-    CH      cheese  CH IY Z
-    D       dee     D IY
-    DH      thee    DH IY
-    EH      Ed      EH D
-    ER      hurt    HH ER T
-    EY      ate     EY T
-    F       fee     F IY
-    G       green   G R IY N
-    HH      he      HH IY
-    IH      it      IH T
-    IY      eat     IY T
-    JH      gee     JH IY
-    K       key     K IY
-    L       lee     L IY
-    M       me      M IY
-    N       knee    N IY
-    NG      ping    P IH NG
-    OW      oat     OW T
-    OY      toy     T OY
-    P       pee     P IY
-    R       read    R IY D
-    S       sea     S IY
-    SH      she     SH IY
-    T       tea     T IY
-    TH      theta   TH EY T AH
-    UH      hood    HH UH D
-    UW      two     T UW
-    V       vee     V IY
-    W       we      W IY
-    Y       yield   Y IY L D
-    Z       zee     Z IY
-    ZH      seizure S IY ZH ER
     """
+    # 39 phonemes
     phonemes = [
         'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH', 'EH', 'ER',
         'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW',

@@ -76,6 +88,8 @@ class ARPABET(Phonetics):
     ]
     punctuations = [',', '.', '?', '!']
     symbols = phonemes + punctuations
+    # vowels carry a lexical stress marker:
+    # 0 unstressed(无重音), 1 primary stress(主重音)和 2 secondary stress(次重音)
     _stress_to_no_stress_ = {
         'AA0': 'AA',
         'AA1': 'AA',

@@ -124,7 +138,12 @@ class ARPABET(Phonetics):
         'UW2': 'UW'
     }

+    def __repr__(self):
+        fmt = "ARPABETWithoutStress(phonemes: {}, punctuations: {})"
+        return fmt.format(len(phonemes), punctuations)
+
     def __init__(self):
+        # https://github.com/Kyubyong/g2p/blob/master/g2p_en/g2p.py
         self.backend = G2p()
         self.vocab = Vocab(self.phonemes + self.punctuations)

@@ -139,6 +158,7 @@ class ARPABET(Phonetics):
         Returns:
             List[str]: The list of pronunciation sequence.
         """
+        # g2p and remove vowel stress
         phonemes = [
             self._remove_vowels(item) for item in self.backend(sentence)
         ]

@@ -158,6 +178,7 @@ class ARPABET(Phonetics):
         Returns:
             List[int]: The list of pronunciation id sequence.
         """
+        # phonemes to ids
         ids = [self.vocab.lookup(item) for item in phonemes]
         return ids

@@ -189,11 +210,16 @@ class ARPABET(Phonetics):
     def vocab_size(self):
         """ Vocab size.
         """
-        # 47 = 39 phones + 4 punctuations + 4 special tokens
+        # 47 = 39 phones + 4 punctuations + 4 special tokens(<pad> <unk> <s> </s>)
         return len(self.vocab)


 class ARPABETWithStress(Phonetics):
+    """
+    A phonology for English that uses ARPABET with stress as the phoneme vocabulary.
+    77 symbols = 69 phones + 4 punctuations + 4 special tokens
+    """
     phonemes = [
         'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
         'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D',

@@ -206,6 +232,10 @@ class ARPABETWithStress(Phonetics):
     punctuations = [',', '.', '?', '!']
     symbols = phonemes + punctuations

+    def __repr__(self):
+        fmt = "ARPABETWithStress(phonemes: {}, punctuations: {})"
+        return fmt.format(len(phonemes), punctuations)
+
     def __init__(self):
         self.backend = G2p()
         self.vocab = Vocab(self.phonemes + self.punctuations)
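The only difference between the two classes is whether the stress digit survives. A sketch of the stress-stripping step, assuming _stress_to_no_stress_ maps each stressed vowel to its bare form (the mapping below is a hypothetical excerpt, not the full table):

_stress_to_no_stress_ = {'AH0': 'AH', 'OW1': 'OW'}  # excerpt only


def remove_stress(phone: str) -> str:
    # unmapped symbols (consonants, punctuation) pass through unchanged
    return _stress_to_no_stress_.get(phone, phone)


print([remove_stress(p) for p in ['HH', 'AH0', 'L', 'OW1']])
# ['HH', 'AH', 'L', 'OW'] : "hello" with the stress markers removed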
paddlespeech/t2s/frontend/polyphonic.yaml

@@ -47,4 +47,5 @@ polyphonic:
   恶行: ['e4', 'xing2']
   唉: ['ai4']
   扎实: ['zha1', 'shi2']
-  干将: ['gan4', 'jiang4']
\ No newline at end of file
+  干将: ['gan4', 'jiang4']
+  陈威行: ['chen2', 'wei1', 'hang2']
\ No newline at end of file
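How such an entry is consumed (a sketch, assuming PyYAML is available): the front end loads the table and uses the listed pinyin for a matched word instead of the default g2p reading, which is how 行 gets hang2 in the name 陈威行:

import yaml

doc = """
polyphonic:
  干将: ['gan4', 'jiang4']
  陈威行: ['chen2', 'wei1', 'hang2']
"""
table = yaml.safe_load(doc)["polyphonic"]
print(table["陈威行"])  # ['chen2', 'wei1', 'hang2']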
paddlespeech/t2s/ssml/__init__.py → paddlespeech/t2s/frontend/ssml/__init__.py (file moved)
paddlespeech/t2s/ssml/xml_processor.py → paddlespeech/t2s/frontend/ssml/xml_processor.py (file moved)

@@ -90,13 +90,14 @@ class MixTextProcessor():
             dom = DomXml(in_xml)
             tags = dom.get_text_and_sayas_tags()
             ctlist.extend(tags)
             ctlist.append(after_xml)
             return ctlist
         else:
             ctlist.append(mixstr)
         return ctlist


 class DomXml():
     def __init__(self, xmlstr):
         self.tdom = parseString(xmlstr)  #Document
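The #Document comment notes that self.tdom is an xml.dom.minidom Document node. The relevant stdlib API, for reference (the sample markup is illustrative):

from xml.dom.minidom import parseString

doc = parseString("<speak>你好<say-as pinyin='hang2'>行</say-as></speak>")
print(doc.documentElement.tagName)  # speak
print(doc.getElementsByTagName("say-as")[0].getAttribute("pinyin"))  # hang2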
paddlespeech/t2s/frontend/tone_sandhi.py

@@ -20,6 +20,9 @@ from pypinyin import Style

 class ToneSandhi():
+    def __repr__(self):
+        return "MandarinToneSandhi"
+
     def __init__(self):
         self.must_neural_tone_words = {
             '麻烦', '麻利', '鸳鸯', '高粱', '骨头', '骆驼', '马虎', '首饰', '馒头', '馄饨', '风筝',

@@ -69,6 +72,19 @@ class ToneSandhi():
         }
         self.punc = ":,;。?!“”‘’':,;.?!"

+    def _split_word(self, word: str) -> List[str]:
+        word_list = jieba.cut_for_search(word)
+        word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
+        first_subword = word_list[0]
+        first_begin_idx = word.find(first_subword)
+        if first_begin_idx == 0:
+            second_subword = word[len(first_subword):]
+            new_word_list = [first_subword, second_subword]
+        else:
+            second_subword = word[:-len(first_subword)]
+            new_word_list = [second_subword, first_subword]
+        return new_word_list
+
     # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
     # e.g.
     # word: "家里"
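_split_word (moved up in the file, otherwise unchanged) splits a word around the shortest sub-word that jieba's search-mode segmentation finds, preserving surface order. The logic with the jieba call stubbed out (the word and sub-word list are hypothetical):

word = "家里人"
word_list = sorted(["家里", "人"], key=len)  # stand-in for jieba.cut_for_search(word)
first_subword = word_list[0]                 # "人", the shortest sub-word
if word.find(first_subword) == 0:
    new_word_list = [first_subword, word[len(first_subword):]]
else:
    new_word_list = [word[:-len(first_subword)], first_subword]
print(new_word_list)  # ['家里', '人']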
@@ -154,18 +170,8 @@ class ToneSandhi():
                 finals[i] = finals[i][:-1] + "4"
         return finals

-    def _split_word(self, word: str) -> List[str]:
-        word_list = jieba.cut_for_search(word)
-        word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
-        first_subword = word_list[0]
-        first_begin_idx = word.find(first_subword)
-        if first_begin_idx == 0:
-            second_subword = word[len(first_subword):]
-            new_word_list = [first_subword, second_subword]
-        else:
-            second_subword = word[:-len(first_subword)]
-            new_word_list = [second_subword, first_subword]
-        return new_word_list
+    def _all_tone_three(self, finals: List[str]) -> bool:
+        return all(x[-1] == "3" for x in finals)

     def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
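_all_tone_three (also just relocated) is the precondition for third-tone sandhi: it holds when every final in the word carries tone 3, as in 你好 (ni3 hao3):

def all_tone_three(finals):
    return all(x[-1] == "3" for x in finals)


print(all_tone_three(["i3", "ao3"]))  # True  -> third-tone sandhi applies
print(all_tone_three(["i3", "ao4"]))  # False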
@@ -207,9 +213,6 @@ class ToneSandhi():
         return finals

-    def _all_tone_three(self, finals: List[str]) -> bool:
-        return all(x[-1] == "3" for x in finals)
-
     # merge "不" and the word behind it
     # if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error
     def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:

@@ -336,6 +339,9 @@ class ToneSandhi():
     def pre_merge_for_modify(
             self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
+        """
+        seg: [(word, pos), ...]
+        """
         seg = self._merge_bu(seg)
         seg = self._merge_yi(seg)
         seg = self._merge_reduplication(seg)

@@ -346,7 +352,11 @@ class ToneSandhi():
     def modified_tone(self, word: str, pos: str,
                       finals: List[str]) -> List[str]:
+        """
+        word: 分词
+        pos: 词性
+        finals: 带调韵母, [final1, ..., finaln]
+        """
         finals = self._bu_sandhi(word, finals)
         finals = self._yi_sandhi(word, finals)
         finals = self._neural_sandhi(word, pos, finals)
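The new docstring spells out the contract: word is a jieba token, pos its part-of-speech tag, and finals its tone-marked finals, with the rules visible in the hunk applied in order (不 sandhi, 一 sandhi, neutral-tone sandhi). A hedged usage sketch, assuming paddlespeech is installed and that jieba tags 你好 as 'l':

from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi

sandhi = ToneSandhi()
# both finals carry tone 3, so third-tone sandhi should change the first one
print(sandhi.modified_tone("你好", "l", ["i3", "ao3"]))  # expected: ['i2', 'ao3']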
paddlespeech/t2s/frontend/zh_frontend.py (+258 −124)

This diff is collapsed on the page (too large to display inline).