Unverified commit b0607485, authored by Hui Zhang, committed by GitHub

Merge pull request #3316 from zh794390558/frontend

[t2s] fix Frontend for lang sentence OOM / update Polyphonic dict / fix 0d-tensor / move ssml to frontend / fix librosa version / hotfix english G2P
......@@ -26,12 +26,12 @@ repos:
- --no-sort-keys
- --autofix
- id: check-merge-conflict
- id: flake8
aergs:
- --ignore=E501,E228,E226,E261,E266,E128,E402,W503
- --builtins=G,request
- --jobs=1
exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$
# - id: flake8
# aergs:
# - --ignore=E501,E228,E226,E261,E266,E128,E402,W503
# - --builtins=G,request
# - --jobs=1
# exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$
- repo : https://github.com/Lucas-C/pre-commit-hooks
rev: v1.0.1
......
......@@ -38,8 +38,10 @@ VERSION = '1.2.0'
COMMITID = 'none'
base = [
# paddleaudio align with librosa==0.8.1, which need numpy==1.23.x
"librosa==0.8.1",
"numpy==1.23.5",
"kaldiio",
"librosa>=0.10.0",
"pathos",
"pybind11",
"parameterized",
......
......@@ -28,7 +28,7 @@ from paddlespeech.server.utils.audio_process import float2pcm
from paddlespeech.server.utils.onnx_infer import get_sess
from paddlespeech.server.utils.util import denorm
from paddlespeech.server.utils.util import get_chunks
from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.en_frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend
__all__ = ['TTSEngine', 'PaddleTTSConnectionHandler']
......
......@@ -29,7 +29,7 @@ from paddlespeech.server.engine.base_engine import BaseEngine
from paddlespeech.server.utils.audio_process import float2pcm
from paddlespeech.server.utils.util import denorm
from paddlespeech.server.utils.util import get_chunks
from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.en_frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.modules.normalizer import ZScore
......
......@@ -32,7 +32,7 @@ from paddlespeech.server.utils.errors import ErrorCode
from paddlespeech.server.utils.exception import ServerBaseException
from paddlespeech.server.utils.paddle_predictor import init_predictor
from paddlespeech.server.utils.paddle_predictor import run_model
from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.en_frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend
__all__ = ['TTSEngine', 'PaddleTTSConnectionHandler']
......
......@@ -18,6 +18,5 @@ from . import exps
from . import frontend
from . import models
from . import modules
from . import ssml
from . import training
from . import utils
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -5,4 +5,5 @@
005 Paddle Bo Bo: 使用 Paddle Speech 的语音合成模块生成虚拟人的声音。
006 热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!
007 我喜欢 eat apple, 你喜欢 drink milk。
008 我们要去云南 team building, 非常非常 happy.
\ No newline at end of file
008 我们要去云南 team building, 非常非常 happy.
009 AI for Sceience 平台。
\ No newline at end of file
......@@ -33,8 +33,8 @@ from yacs.config import CfgNode
from paddlespeech.t2s.datasets.am_batch_fn import *
from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip_static
from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.canton_frontend import CantonFrontend
from paddlespeech.t2s.frontend.en_frontend import English
from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
from paddlespeech.t2s.frontend.sing_frontend import SingFrontend
from paddlespeech.t2s.frontend.zh_frontend import Frontend
......@@ -99,14 +99,23 @@ def norm(data, mean, std):
return (data - mean) / std
def get_chunks(data, block_size: int, pad_size: int):
data_len = data.shape[1]
def get_chunks(mel, chunk_size: int, pad_size: int):
"""
Split mel by chunk size with left and right context.
Args:
mel (paddle.Tensor): mel spectrogram, shape (B, T, D)
chunk_size (int): chunk size
pad_size (int): size for left and right context.
"""
T = mel.shape[1]
n = math.ceil(T / chunk_size)
chunks = []
n = math.ceil(data_len / block_size)
for i in range(n):
start = max(0, i * block_size - pad_size)
end = min((i + 1) * block_size + pad_size, data_len)
chunks.append(data[:, start:end, :])
start = max(0, i * chunk_size - pad_size)
end = min((i + 1) * chunk_size + pad_size, T)
chunks.append(mel[:, start:end, :])
return chunks
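For reference, a minimal standalone sketch of the chunking above (NumPy stands in for the paddle.Tensor input, and the sizes are illustrative), showing how the left/right context padding changes each chunk's length:

```python
import math
import numpy as np

def get_chunks(mel, chunk_size: int, pad_size: int):
    # split along the time axis (axis 1), keeping pad_size frames of context
    T = mel.shape[1]
    chunks = []
    for i in range(math.ceil(T / chunk_size)):
        start = max(0, i * chunk_size - pad_size)
        end = min((i + 1) * chunk_size + pad_size, T)
        chunks.append(mel[:, start:end, :])
    return chunks

mel = np.zeros((1, 100, 80))                           # (B, T, D)
print([c.shape[1] for c in get_chunks(mel, 42, 12)])   # [54, 66, 28]
```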
......@@ -117,14 +126,10 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
with open(text_file, 'rt', encoding='utf-8') as f:
for line in f:
if line.strip() != "":
items = re.split(r"\s+", line.strip(), 1)
items = re.split(r"\s+", line.strip(), maxsplit=1)
assert len(items) == 2
utt_id = items[0]
if lang in {'zh', 'canton'}:
sentence = "".join(items[1:])
elif lang == 'en':
sentence = " ".join(items[1:])
elif lang == 'mix':
sentence = " ".join(items[1:])
sentence = items[1]
sentences.append((utt_id, sentence))
return sentences
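A quick illustration of why `maxsplit=1` is enough here: the first whitespace run separates the utterance id, and everything after it stays as one sentence regardless of language (the sample line comes from sentences_mix.txt above):

```python
import re

line = "007 我喜欢 eat apple, 你喜欢 drink milk。"
utt_id, sentence = re.split(r"\s+", line.strip(), maxsplit=1)
print(utt_id)    # 007
print(sentence)  # 我喜欢 eat apple, 你喜欢 drink milk。
```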
......@@ -319,6 +324,7 @@ def run_frontend(
input_ids = {}
if text.strip() != "" and re.match(r".*?<speak>.*?</speak>.*", text,
re.DOTALL):
# using ssml
input_ids = frontend.get_input_ids_ssml(
text,
merge_sentences=merge_sentences,
......@@ -359,6 +365,7 @@ def run_frontend(
outs.update({'is_slurs': is_slurs})
else:
print("lang should in {'zh', 'en', 'mix', 'canton', 'sing'}!")
outs.update({'phone_ids': phone_ids})
return outs
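The SSML branch above is selected with a plain regex check for a `<speak>` element; a small self-contained illustration (the sample text comes from the SSML test further down):

```python
import re

text = "<speak>前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上</speak>"
is_ssml = bool(re.match(r".*?<speak>.*?</speak>.*", text, re.DOTALL))
print(is_ssml)  # True -> frontend.get_input_ids_ssml() is used
```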
......
......@@ -13,6 +13,7 @@
# limitations under the License.
import argparse
from pathlib import Path
from pprint import pprint
import paddle
import soundfile as sf
......@@ -78,6 +79,7 @@ def evaluate(args):
# whether dygraph to static
if args.inference_dir:
print("convert am and voc to static model.")
# acoustic model
am_inference = am_to_static(
am_inference=am_inference,
......@@ -92,6 +94,7 @@ def evaluate(args):
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
merge_sentences = False
# Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph
# but still not stopping in the end (NOTE by yuantian01 Feb 9 2022)
......@@ -102,13 +105,19 @@ def evaluate(args):
if am_name == 'speedyspeech':
get_tone_ids = True
# wav samples
N = 0
# inference time cost
T = 0
# [(uid, text), ]
if am_name == 'diffsinger':
sentences = get_sentences_svs(text_file=args.text)
else:
sentences = get_sentences(text_file=args.text, lang=args.lang)
for utt_id, sentence in sentences:
print(f"{utt_id} {sentence}")
with timer() as t:
if am_name == "diffsinger":
text = ""
......@@ -116,6 +125,8 @@ def evaluate(args):
else:
text = sentence
svs_input = None
# frontend
frontend_dict = run_frontend(
frontend=frontend,
text=text,
......@@ -124,25 +135,33 @@ def evaluate(args):
lang=args.lang,
svs_input=svs_input)
phone_ids = frontend_dict['phone_ids']
# pprint(f"{utt_id} {phone_ids}")
with paddle.no_grad():
flags = 0
for i in range(len(phone_ids)):
# sub phone, split by `sp` or punctuation.
part_phone_ids = phone_ids[i]
# acoustic model
if am_name == 'fastspeech2':
# multi speaker
if am_dataset in {"aishell3", "vctk", "mix", "canton"}:
spk_id = paddle.to_tensor(args.spk_id)
# multi-speaker
spk_id = paddle.to_tensor([args.spk_id])
mel = am_inference(part_phone_ids, spk_id)
else:
# single-speaker
mel = am_inference(part_phone_ids)
elif am_name == 'speedyspeech':
part_tone_ids = frontend_dict['tone_ids'][i]
if am_dataset in {"aishell3", "vctk", "mix"}:
spk_id = paddle.to_tensor(args.spk_id)
# multi-speaker
spk_id = paddle.to_tensor([args.spk_id])
mel = am_inference(part_phone_ids, part_tone_ids,
spk_id)
else:
# single-speaker
mel = am_inference(part_phone_ids, part_tone_ids)
elif am_name == 'tacotron2':
mel = am_inference(part_phone_ids)
......@@ -155,6 +174,7 @@ def evaluate(args):
note=part_note_ids,
note_dur=part_note_durs,
is_slur=part_is_slurs, )
# vocoder
wav = voc_inference(mel)
if flags == 0:
......@@ -162,17 +182,23 @@ def evaluate(args):
flags = 1
else:
wav_all = paddle.concat([wav_all, wav])
wav = wav_all.numpy()
N += wav.size
T += t.elapse
# samples per second
speed = wav.size / t.elapse
# generating one second of wav takes `RTF` seconds
rtf = am_config.fs / speed
print(
f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
)
sf.write(
str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs)
print(f"{utt_id} done!")
print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")
......
......@@ -27,7 +27,7 @@ import yaml
from yacs.config import CfgNode as Configuration
from paddlespeech.t2s.datasets.get_feats import LogMelFBank
from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.en_frontend import English
def get_lj_sentences(file_name, frontend):
......
......@@ -21,7 +21,7 @@ import soundfile as sf
import yaml
from yacs.config import CfgNode
from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.en_frontend import English
from paddlespeech.t2s.models.transformer_tts import TransformerTTS
from paddlespeech.t2s.models.transformer_tts import TransformerTTSInference
from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow
......
......@@ -13,8 +13,8 @@
# limitations under the License.
from .generate_lexicon import *
from .normalizer import *
from .phonectic import *
from .punctuation import *
from .ssml import *
from .tone_sandhi import *
from .vocab import *
from .zh_normalization import *
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlespeech.t2s.frontend.phonectic import Phonetics
"""
A phonology system with ARPABET symbols and limited punctuations. The G2P
conversion is done by g2p_en.
......@@ -19,55 +18,68 @@ conversion is done by g2p_en.
Note that g2p_en does not handle words with hyphens well. So make sure the input
sentence is first normalized.
"""
from paddlespeech.t2s.frontend.vocab import Vocab
from g2p_en import G2p
from paddlespeech.t2s.frontend.phonectic import Phonetics
from paddlespeech.t2s.frontend.vocab import Vocab
class ARPABET(Phonetics):
"""A phonology for English that uses ARPABET as the phoneme vocabulary.
"""A phonology for English that uses ARPABET without stress as the phoneme vocabulary.
47 symbols = 39 phones + 4 punctuations + 4 special tokens(<pad> <unk> <s> </s>)
The current phoneme set contains 39 phonemes, vowels carry a lexical stress marker:
0 — No stress
1 — Primary stress
2 — Secondary stress
Phoneme Set:
Phoneme Example Translation
------- ------- -----------
AA odd AA D
AE at AE T
AH hut HH AH T
AO ought AO T
AW cow K AW
AY hide HH AY D
B be B IY
CH cheese CH IY Z
D dee D IY
DH thee DH IY
EH Ed EH D
ER hurt HH ER T
EY ate EY T
F fee F IY
G green G R IY N
HH he HH IY
IH it IH T
IY eat IY T
JH gee JH IY
K key K IY
L lee L IY
M me M IY
N knee N IY
NG ping P IH NG
OW oat OW T
OY toy T OY
P pee P IY
R read R IY D
S sea S IY
SH she SH IY
T tea T IY
TH theta TH EY T AH
UH hood HH UH D
UW two T UW
V vee V IY
W we W IY
Y yield Y IY L D
Z zee Z IY
ZH seizure S IY ZH ER
See http://www.speech.cs.cmu.edu/cgi-bin/cmudict for more details.
Phoneme Example Translation
------- ------- -----------
AA odd AA D
AE at AE T
AH hut HH AH T
AO ought AO T
AW cow K AW
AY hide HH AY D
B be B IY
CH cheese CH IY Z
D dee D IY
DH thee DH IY
EH Ed EH D
ER hurt HH ER T
EY ate EY T
F fee F IY
G green G R IY N
HH he HH IY
IH it IH T
IY eat IY T
JH gee JH IY
K key K IY
L lee L IY
M me M IY
N knee N IY
NG ping P IH NG
OW oat OW T
OY toy T OY
P pee P IY
R read R IY D
S sea S IY
SH she SH IY
T tea T IY
TH theta TH EY T AH
UH hood HH UH D
UW two T UW
V vee V IY
W we W IY
Y yield Y IY L D
Z zee Z IY
ZH seizure S IY ZH ER
"""
# 39 phonemes
phonemes = [
'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH', 'EH', 'ER',
'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW',
......@@ -76,6 +88,8 @@ class ARPABET(Phonetics):
]
punctuations = [',', '.', '?', '!']
symbols = phonemes + punctuations
# vowels carry a lexical stress marker:
# 0 unstressed, 1 primary stress, 2 secondary stress
_stress_to_no_stress_ = {
'AA0': 'AA',
'AA1': 'AA',
......@@ -124,7 +138,12 @@ class ARPABET(Phonetics):
'UW2': 'UW'
}
def __repr__(self):
fmt = "ARPABETWithoutStress(phonemes: {}, punctuations: {})"
return fmt.format(len(self.phonemes), self.punctuations)
def __init__(self):
# https://github.com/Kyubyong/g2p/blob/master/g2p_en/g2p.py
self.backend = G2p()
self.vocab = Vocab(self.phonemes + self.punctuations)
......@@ -139,6 +158,7 @@ class ARPABET(Phonetics):
Returns:
List[str]: The list of pronunciation sequence.
"""
# g2p and remove vowel stress
phonemes = [
self._remove_vowels(item) for item in self.backend(sentence)
]
......@@ -158,6 +178,7 @@ class ARPABET(Phonetics):
Returns:
List[int]: The list of pronunciation id sequence.
"""
# phonemes to ids
ids = [self.vocab.lookup(item) for item in phonemes]
return ids
......@@ -189,11 +210,16 @@ class ARPABET(Phonetics):
def vocab_size(self):
""" Vocab size.
"""
# 47 = 39 phones + 4 punctuations + 4 special tokens
# 47 = 39 phones + 4 punctuations + 4 special tokens(<pad> <unk> <s> </s>)
return len(self.vocab)
class ARPABETWithStress(Phonetics):
"""
A phonology for English that uses ARPABET with stress as the phoneme vocabulary.
77 symbols = 69 phones + 4 punctuations + 4 special tokens
"""
phonemes = [
'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D',
......@@ -206,6 +232,10 @@ class ARPABETWithStress(Phonetics):
punctuations = [',', '.', '?', '!']
symbols = phonemes + punctuations
def __repr__(self):
fmt = "ARPABETWithStress(phonemes: {}, punctuations: {})"
return fmt.format(len(self.phonemes), self.punctuations)
def __init__(self):
self.backend = G2p()
self.vocab = Vocab(self.phonemes + self.punctuations)
......
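For intuition, a minimal sketch of the stress-stripping step that the `_stress_to_no_stress_` table encodes (assumes g2p_en is installed; the printed phones are a typical g2p_en output, shown as an example):

```python
from g2p_en import G2p

g2p = G2p()
phones = g2p("hello")                                   # e.g. ['HH', 'AH0', 'L', 'OW1']
no_stress = [p[:-1] if p[-1].isdigit() else p for p in phones]
print(no_stress)                                        # e.g. ['HH', 'AH', 'L', 'OW']
```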
......@@ -29,7 +29,8 @@ INITIALS = [
INITIALS += ['sp', 'spl', 'spn', 'sil']
def get_lines(cantons: List[str]):
def jyuping_to_phonemes(cantons: List[str]):
# jyuping to initial and final
phones = []
for canton in cantons:
for consonant in INITIALS:
......@@ -47,7 +48,7 @@ def get_lines(cantons: List[str]):
class CantonFrontend():
def __init__(self, phone_vocab_path: str):
self.text_normalizer = TextNormalizer()
self.punc = ":,;。?!“”‘’':,;.?!"
self.punc = ":,;。?!“”‘’':,;.?!"
self.vocab_phones = {}
if phone_vocab_path:
......@@ -61,8 +62,11 @@ class CantonFrontend():
merge_sentences: bool=True) -> List[List[str]]:
phones_list = []
for sentence in sentences:
# jyuping
# 'gam3 ngaam1 lou5 sai3 jiu1 kau4 keoi5 dang2 zan6 jiu3 hoi1 wui2, zing6 dai1 ge2 je5 ngo5 wui5 gaau2 dim6 ga3 laa3.'
phones_str = ToJyutping.get_jyutping_text(sentence)
phones_split = get_lines(phones_str.split(' '))
# phonemes
phones_split = jyuping_to_phonemes(phones_str.split(' '))
phones_list.append(phones_split)
return phones_list
......@@ -78,8 +82,11 @@ class CantonFrontend():
sentence: str,
merge_sentences: bool=True,
print_info: bool=False) -> List[List[str]]:
# TN & Text Segmentation
sentences = self.text_normalizer.normalize(sentence)
# G2P
phonemes = self._g2p(sentences, merge_sentences=merge_sentences)
if print_info:
print("----------------------------")
print("text norm results:")
......@@ -88,6 +95,7 @@ class CantonFrontend():
print("g2p results:")
print(phonemes)
print("----------------------------")
return phonemes
def get_input_ids(self,
......@@ -98,9 +106,9 @@ class CantonFrontend():
phonemes = self.get_phonemes(
sentence, merge_sentences=merge_sentences, print_info=print_info)
result = {}
temp_phone_ids = []
for phones in phonemes:
if phones:
phone_ids = self._p2id(phones)
......@@ -108,6 +116,8 @@ class CantonFrontend():
if to_tensor:
phone_ids = paddle.to_tensor(phone_ids)
temp_phone_ids.append(phone_ids)
if temp_phone_ids:
result["phone_ids"] = temp_phone_ids
return result
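A hedged usage sketch of the Cantonese frontend above; the vocab path below is a placeholder, and ToJyutping must be installed for the jyutping conversion:

```python
from paddlespeech.t2s.frontend.canton_frontend import CantonFrontend

# phone_vocab_path is a hypothetical 'phone id' map file
frontend = CantonFrontend(phone_vocab_path="canton_phone_id_map.txt")
outs = frontend.get_input_ids("你好", merge_sentences=True, print_info=True)
print(outs["phone_ids"])   # list of paddle.Tensor phone-id sequences
```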
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .phonectic import English
......@@ -18,9 +18,9 @@ from typing import List
import numpy as np
import paddle
from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor
from paddlespeech.t2s.frontend.en_frontend import English as EnFrontend
from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor
from paddlespeech.t2s.frontend.zh_frontend import Frontend as ZhFrontend
class MixFrontend():
......@@ -28,10 +28,9 @@ class MixFrontend():
g2p_model="pypinyin",
phone_vocab_path=None,
tone_vocab_path=None):
self.zh_frontend = Frontend(
self.zh_frontend = ZhFrontend(
phone_vocab_path=phone_vocab_path, tone_vocab_path=tone_vocab_path)
self.en_frontend = English(phone_vocab_path=phone_vocab_path)
self.en_frontend = EnFrontend(phone_vocab_path=phone_vocab_path)
self.sp_id = self.zh_frontend.vocab_phones["sp"]
self.sp_id_numpy = np.array([self.sp_id])
self.sp_id_tensor = paddle.to_tensor([self.sp_id])
......@@ -55,15 +54,12 @@ class MixFrontend():
else:
return False
def get_segment(self, text: str) -> List[str]:
def split_by_lang(self, text: str) -> List[str]:
# sentence --> [ch_part, en_part, ch_part, ...]
segments = []
types = []
flag = 0
temp_seg = ""
temp_lang = ""
# Determine the type of each character. type: blank, chinese, alphabet, number, unk and point.
# Determine the type of each character. type: chinese, alphabet, other.
for ch in text:
if self.is_chinese(ch):
types.append("zh")
......@@ -74,31 +70,31 @@ class MixFrontend():
assert len(types) == len(text)
for i in range(len(types)):
flag = 0
temp_seg = ""
temp_lang = ""
for i in range(len(text)):
# find the first char of the seg
if flag == 0:
temp_seg += text[i]
temp_lang = types[i]
flag = 1
else:
if temp_lang == "other":
if types[i] == temp_lang:
temp_seg += text[i]
else:
temp_seg += text[i]
# text start is not lang.
temp_seg += text[i]
if types[i] != temp_lang:
temp_lang = types[i]
else:
if types[i] == temp_lang:
temp_seg += text[i]
elif types[i] == "other":
if types[i] == temp_lang or types[i] == "other":
# merge same lang or other
temp_seg += text[i]
else:
# change lang
segments.append((temp_seg, temp_lang))
temp_seg = text[i]
temp_lang = types[i]
flag = 1
temp_lang = types[i] # new lang
segments.append((temp_seg, temp_lang))
......@@ -110,76 +106,95 @@ class MixFrontend():
get_tone_ids: bool=False,
add_sp: bool=True,
to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
''' 1. Add SSML support: first list the plain text and the <say-as> tag contents,
then append them to the tmpSegments list.
'''
d_inputs = MixTextProcessor.get_dom_split(sentence)
tmpSegments = []
for instr in d_inputs:
''' only say-as is supported for now '''
if instr.lower().startswith("<say-as"):
tmpSegments.append((instr, "zh"))
# XML Document Object Model (DOM)
doms = MixTextProcessor.get_dom_split(sentence)
lang_splits = []
for dom in doms:
if dom.lower().startswith("<say-as pinyin="):
# `<say-as pinyin=` for zh lang
lang_splits.append((dom, "zh"))
else:
tmpSegments.extend(self.get_segment(instr))
''' 2. Merge adjacent zh segments to avoid pauses in the middle of the synthesized result.
'''
# process zh, en and zh/en
lang_splits.extend(self.split_by_lang(dom))
# merge adjacent zh segment
segments = []
currentSeg = ["", ""]
for seg in tmpSegments:
for seg in lang_splits:
if seg[1] == "en" or seg[1] == "other":
if currentSeg[0] == '':
# first see
segments.append(seg)
else:
# zh
currentSeg[0] = "<speak>" + currentSeg[0] + "</speak>"
segments.append(tuple(currentSeg))
# en
segments.append(seg)
# reset
currentSeg = ["", ""]
else:
# zh
if currentSeg[0] == '':
# first see
currentSeg[0] = seg[0]
currentSeg[1] = seg[1]
else:
# merge zh
currentSeg[0] = currentSeg[0] + seg[0]
if currentSeg[0] != '':
# last zh
currentSeg[0] = "<speak>" + currentSeg[0] + "</speak>"
segments.append(tuple(currentSeg))
phones_list = []
result = {}
# 008 我们要去云南 team building, 非常非常 happy.
# seg ('我们要去云南 ', 'zh')
# seg ('team building, ', 'en')
# seg ('非常非常 ', 'zh')
# seg ('happy.', 'en')
# [('<speak>我们要去云南 </speak>', 'zh'), ('team building, ', 'en'), ('<speak>非常非常 </speak>', 'zh'), ('happy.', 'en')]
for seg in segments:
content = seg[0]
lang = seg[1]
if content != '':
if lang == "en":
input_ids = self.en_frontend.get_input_ids(
content, merge_sentences=False, to_tensor=to_tensor)
if not content:
continue
if lang == "en":
input_ids = self.en_frontend.get_input_ids(
content, merge_sentences=False, to_tensor=to_tensor)
else:
if content.strip() != "" and \
re.match(r".*?<speak>.*?</speak>.*", content, re.DOTALL):
# process ssml
input_ids = self.zh_frontend.get_input_ids_ssml(
content,
merge_sentences=False,
get_tone_ids=get_tone_ids,
to_tensor=to_tensor)
else:
''' 3. Handle Chinese wrapped in <speak> tags and plain text separately.
'''
if content.strip() != "" and \
re.match(r".*?<speak>.*?</speak>.*", content, re.DOTALL):
input_ids = self.zh_frontend.get_input_ids_ssml(
content,
merge_sentences=False,
get_tone_ids=get_tone_ids,
to_tensor=to_tensor)
else:
input_ids = self.zh_frontend.get_input_ids(
content,
merge_sentences=False,
get_tone_ids=get_tone_ids,
to_tensor=to_tensor)
if add_sp:
if to_tensor:
input_ids["phone_ids"][-1] = paddle.concat(
[input_ids["phone_ids"][-1], self.sp_id_tensor])
else:
input_ids["phone_ids"][-1] = np.concatenate(
(input_ids["phone_ids"][-1], self.sp_id_numpy))
# process plain text
input_ids = self.zh_frontend.get_input_ids(
content,
merge_sentences=False,
get_tone_ids=get_tone_ids,
to_tensor=to_tensor)
if add_sp:
# add sp between zh and en
if to_tensor:
input_ids["phone_ids"][-1] = paddle.concat(
[input_ids["phone_ids"][-1], self.sp_id_tensor])
else:
input_ids["phone_ids"][-1] = np.concatenate(
(input_ids["phone_ids"][-1], self.sp_id_numpy))
for phones in input_ids["phone_ids"]:
phones_list.append(phones)
phones_list.extend(input_ids["phone_ids"])
if merge_sentences:
merge_list = paddle.concat(phones_list)
......
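A hedged end-to-end sketch of the mixed zh/en flow above (the vocab path is a placeholder; the sentence is taken from sentences_mix.txt in this PR):

```python
from paddlespeech.t2s.frontend.mix_frontend import MixFrontend

frontend = MixFrontend(phone_vocab_path="phone_id_map.txt")   # hypothetical path
outs = frontend.get_input_ids(
    "我们要去云南 team building, 非常非常 happy.",
    merge_sentences=False,
    add_sp=True)
print([p.shape for p in outs["phone_ids"]])
```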
......@@ -47,15 +47,34 @@ class Phonetics(ABC):
class English(Phonetics):
""" Normalize the input text sequence and convert into pronunciation id sequence.
https://github.com/Kyubyong/g2p/blob/master/g2p_en/g2p.py
phonemes = ["<pad>", "<unk>", "<s>", "</s>"] + [
'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH',
'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
'EY2', 'F', 'G', 'HH',
'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L',
'M', 'N', 'NG', 'OW0', 'OW1',
'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH',
'UH0', 'UH1', 'UH2', 'UW',
'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH']
"""
LEXICON = {
# key using lowercase
"AI".lower(): [["EY0", "AY1"]],
}
def __init__(self, phone_vocab_path=None):
self.backend = G2p()
self.backend.cmu.update(English.LEXICON)
self.phonemes = list(self.backend.phonemes)
self.punctuations = get_punctuations("en")
self.vocab = Vocab(self.phonemes + self.punctuations)
self.vocab_phones = {}
self.punc = ":,;。?!“”‘’':,;.?!"
self.punc = ":,;。?!“”‘’':,;.?!"
self.text_normalizer = TextNormalizer()
if phone_vocab_path:
with open(phone_vocab_path, 'rt', encoding='utf-8') as f:
......@@ -86,8 +105,8 @@ class English(Phonetics):
sentence: str,
merge_sentences: bool=False,
to_tensor: bool=True) -> paddle.Tensor:
result = {}
sentences = self.text_normalizer._split(sentence, lang="en")
phones_list = []
temp_phone_ids = []
for sentence in sentences:
......@@ -118,7 +137,10 @@ class English(Phonetics):
if to_tensor:
phone_ids = paddle.to_tensor(phone_ids)
temp_phone_ids.append(phone_ids)
result = {}
result["phone_ids"] = temp_phone_ids
return result
def numericalize(self, phonemes):
......
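The `LEXICON` override above works by patching g2p_en's CMU dictionary before lookup; a minimal sketch of the same mechanism (assumes g2p_en is installed; note that g2p_en lower-cases its input, hence the lower-cased key):

```python
from g2p_en import G2p

g2p = G2p()
g2p.cmu.update({"ai": [["EY0", "AY1"]]})   # same trick as English.LEXICON
print(g2p("AI"))                           # expected: ['EY0', 'AY1']
```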
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import yaml
class Polyphonic():
def __init__(self):
with open(
os.path.join(
os.path.dirname(os.path.abspath(__file__)),
'polyphonic.yaml'),
'r',
encoding='utf-8') as polyphonic_file:
# parse the yaml file
polyphonic_dict = yaml.load(polyphonic_file, Loader=yaml.FullLoader)
self.polyphonic_words = polyphonic_dict["polyphonic"]
def correct_pronunciation(self, word, pinyin):
# if the word is in the dict, return the corrected pronunciation
if word in self.polyphonic_words.keys():
pinyin = self.polyphonic_words[word]
# otherwise return the original pronunciation
return pinyin
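A hedged usage sketch of the polyphonic correction above (the module path is assumed; '干将' is one of the entries added to polyphonic.yaml in this PR):

```python
from paddlespeech.t2s.frontend.polyphonic import Polyphonic  # assumed module path

poly = Polyphonic()
# listed word: the pinyin is replaced by the dictionary entry
print(poly.correct_pronunciation('干将', ['gan1', 'jiang1']))   # ['gan4', 'jiang4']
# unlisted word: the pinyin passes through unchanged
print(poly.correct_pronunciation('你好', ['ni3', 'hao3']))      # ['ni3', 'hao3']
```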
......@@ -47,4 +47,8 @@ polyphonic:
恶行: ['e4','xing2']
: ['ai4']
扎实: ['zha1','shi2']
干将: ['gan4','jiang4']
\ No newline at end of file
干将: ['gan4','jiang4']
陈威行: ['chen2', 'wei1', 'hang2']
郭晟: ['guo1', 'sheng4']
中标: ['zhong4', 'biao1']
抗住: ['kang2', 'zhu4']
\ No newline at end of file
......@@ -29,7 +29,7 @@ class SingFrontend():
pinyin_phone_path (str): pinyin to phone file path, a 'pinyin|phones' (like: ba|b a ) pair per line.
phone_vocab_path (str): phone to phone id file path, a 'phone phone id' (like: a 4 ) pair per line.
"""
self.punc = '[:,;。?!“”‘’\':,;.?!]'
self.punc = '[:,;。?!“”‘’\':,;.?!]'
self.pinyin_phones = {'AP': 'AP', 'SP': 'SP'}
if pinyin_phone_path:
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# -*- coding: utf-8 -*-
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import xml.dom.minidom
import xml.parsers.expat
......@@ -17,7 +30,6 @@ Note: xml 有5种特殊字符, &<>"'
' &apos;
e.g.:
<TitleName>&quot;姓名&quot;</TitleName>
'''
......@@ -61,17 +73,29 @@ class MixTextProcessor():
patn = re.compile(r'(.*\s*?)(<speak>.*?</speak>)(.*\s*)$', re.M | re.S)
mat = re.match(patn, mixstr)
if mat:
# pre <speak>
pre_xml = mat.group(1)
# between <speak> ... </speak>
in_xml = mat.group(2)
# post </speak>
after_xml = mat.group(3)
ctlist.append([pre_xml, []])
# pre part, with no syllables
if pre_xml:
ctlist.append([pre_xml, []])
# between with syllable
# [(sub sentence, [syllables]), ...]
dom = DomXml(in_xml)
pinyinlist = dom.get_pinyins_for_xml()
ctlist = ctlist + pinyinlist
ctlist.append([after_xml, []])
# post part, with no syllables
if after_xml:
ctlist.append([after_xml, []])
else:
ctlist.append([mixstr, []])
return ctlist
@classmethod
......@@ -86,17 +110,21 @@ class MixTextProcessor():
in_xml = mat.group(2)
after_xml = mat.group(3)
ctlist.append(pre_xml)
if pre_xml:
ctlist.append(pre_xml)
dom = DomXml(in_xml)
tags = dom.get_text_and_sayas_tags()
ctlist.extend(tags)
ctlist.append(after_xml)
return ctlist
if after_xml:
ctlist.append(after_xml)
else:
ctlist.append(mixstr)
return ctlist
class DomXml():
def __init__(self, xmlstr):
self.tdom = parseString(xmlstr) #Document
......
......@@ -20,6 +20,9 @@ from pypinyin import Style
class ToneSandhi():
def __repr__(self):
return "MandarinToneSandhi"
def __init__(self):
self.must_neural_tone_words = {
'麻烦', '麻利', '鸳鸯', '高粱', '骨头', '骆驼', '马虎', '首饰', '馒头', '馄饨', '风筝',
......@@ -65,9 +68,22 @@ class ToneSandhi():
'男子', '女子', '分子', '原子', '量子', '莲子', '石子', '瓜子', '电子', '人人', '虎虎',
'幺幺', '干嘛', '学子', '哈哈', '数数', '袅袅', '局地', '以下', '娃哈哈', '花花草草', '留得',
'耕地', '想想', '熙熙', '攘攘', '卵子', '死死', '冉冉', '恳恳', '佼佼', '吵吵', '打打',
'考考', '整整', '莘莘', '落地', '算子', '家家户户'
'考考', '整整', '莘莘', '落地', '算子', '家家户户', '青青'
}
self.punc = ":,;。?!“”‘’':,;.?!"
self.punc = "、:,;。?!“”‘’':,;.?!"
def _split_word(self, word: str) -> List[str]:
word_list = jieba.cut_for_search(word)
word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
first_subword = word_list[0]
first_begin_idx = word.find(first_subword)
if first_begin_idx == 0:
second_subword = word[len(first_subword):]
new_word_list = [first_subword, second_subword]
else:
second_subword = word[:-len(first_subword)]
new_word_list = [second_subword, first_subword]
return new_word_list
# the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
# e.g.
......@@ -154,18 +170,8 @@ class ToneSandhi():
finals[i] = finals[i][:-1] + "4"
return finals
def _split_word(self, word: str) -> List[str]:
word_list = jieba.cut_for_search(word)
word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
first_subword = word_list[0]
first_begin_idx = word.find(first_subword)
if first_begin_idx == 0:
second_subword = word[len(first_subword):]
new_word_list = [first_subword, second_subword]
else:
second_subword = word[:-len(first_subword)]
new_word_list = [second_subword, first_subword]
return new_word_list
def _all_tone_three(self, finals: List[str]) -> bool:
return all(x[-1] == "3" for x in finals)
def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
......@@ -207,9 +213,6 @@ class ToneSandhi():
return finals
def _all_tone_three(self, finals: List[str]) -> bool:
return all(x[-1] == "3" for x in finals)
# merge "不" and the word behind it
# if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error
def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
......@@ -336,6 +339,9 @@ class ToneSandhi():
def pre_merge_for_modify(
self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
"""
seg: [(word, pos), ...]
"""
seg = self._merge_bu(seg)
seg = self._merge_yi(seg)
seg = self._merge_reduplication(seg)
......@@ -346,7 +352,11 @@ class ToneSandhi():
def modified_tone(self, word: str, pos: str,
finals: List[str]) -> List[str]:
"""
word: the segmented word
pos: part-of-speech tag
finals: finals with tone marks, [final1, ..., finaln]
"""
finals = self._bu_sandhi(word, finals)
finals = self._yi_sandhi(word, finals)
finals = self._neural_sandhi(word, pos, finals)
......
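A hedged sketch of calling the sandhi rules above directly; the finals are in pypinyin tone3 style, and the expected output follows the "不 + 4th tone → 2nd tone" rule:

```python
from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi

sandhi = ToneSandhi()
# "不" before a 4th-tone syllable is read with the 2nd tone: bu4 yao4 -> bu2 yao4
print(sandhi.modified_tone("不要", "d", ["u4", "iao4"]))   # expected: ['u2', 'iao4']
```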
......@@ -40,6 +40,8 @@ base = [
"hyperpyyaml",
"inflect",
"jsonlines",
# paddleaudio align with librosa==0.8.1, which need numpy==1.23.x
"numpy==1.23.5",
"librosa==0.8.1",
"scipy>=1.4.0",
"loguru",
......@@ -260,6 +262,7 @@ setup_info = dict(
long_description=read("README.md"),
long_description_content_type="text/markdown",
keywords=[
"SSL"
"speech",
"asr",
"tts",
......@@ -268,12 +271,19 @@ setup_info = dict(
"text frontend",
"MFA",
"paddlepaddle",
"paddleaudio",
"streaming asr",
"streaming tts",
"beam search",
"ctcdecoder",
"deepspeech2",
"wav2vec2",
"hubert",
"wavlm",
"transformer",
"conformer",
"fastspeech2",
"hifigan",
"gan vocoders",
],
python_requires='>=3.7',
......
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlespeech.t2s.frontend.en_frontend import English as EnFrontend
if __name__ == '__main__':
fe = EnFrontend()
text = "AI for Sceience"
phonemes = fe.phoneticize(text)
print(text)
print(phonemes)
text = "eight"
phonemes = fe.phoneticize(text)
print(text)
print(phonemes)
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import tempfile
from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
# mix zh & en phonemes
phone_id_str = """
<pad> 0
<unk> 1
AA0 2
AA1 3
AA2 4
AE0 5
AE1 6
AE2 7
AH0 8
AH1 9
AH2 10
AO0 11
AO1 12
AO2 13
AW0 14
AW1 15
AW2 16
AY0 17
AY1 18
AY2 19
B 20
CH 21
D 22
DH 23
EH0 24
EH1 25
EH2 26
ER0 27
ER1 28
ER2 29
EY0 30
EY1 31
EY2 32
F 33
G 34
HH 35
IH0 36
IH1 37
IH2 38
IY0 39
IY1 40
IY2 41
JH 42
K 43
L 44
M 45
N 46
NG 47
OW0 48
OW1 49
OW2 50
OY0 51
OY1 52
OY2 53
P 54
R 55
S 56
SH 57
T 58
TH 59
UH0 60
UH1 61
UH2 62
UW0 63
UW1 64
UW2 65
V 66
W 67
Y 68
Z 69
ZH 70
a1 71
a2 72
a3 73
a4 74
a5 75
ai1 76
ai2 77
ai3 78
ai4 79
ai5 80
air2 81
air3 82
air4 83
an1 84
an2 85
an3 86
an4 87
an5 88
ang1 89
ang2 90
ang3 91
ang4 92
ang5 93
angr2 94
angr4 95
anr1 96
anr3 97
anr4 98
ao1 99
ao2 100
ao3 101
ao4 102
ao5 103
aor1 104
aor3 105
aor4 106
aor5 107
ar2 108
ar3 109
ar4 110
ar5 111
b 112
c 113
ch 114
d 115
e1 116
e2 117
e3 118
e4 119
e5 120
ei1 121
ei2 122
ei3 123
ei4 124
ei5 125
eir4 126
en1 127
en2 128
en3 129
en4 130
en5 131
eng1 132
eng2 133
eng3 134
eng4 135
eng5 136
engr4 137
enr1 138
enr2 139
enr3 140
enr4 141
enr5 142
er1 143
er2 144
er3 145
er4 146
er5 147
f 148
g 149
h 150
i1 151
i2 152
i3 153
i4 154
i5 155
ia1 156
ia2 157
ia3 158
ia4 159
ia5 160
ian1 161
ian2 162
ian3 163
ian4 164
ian5 165
iang1 166
iang2 167
iang3 168
iang4 169
iang5 170
iangr4 171
ianr1 172
ianr2 173
ianr3 174
ianr4 175
ianr5 176
iao1 177
iao2 178
iao3 179
iao4 180
iao5 181
iaor1 182
iaor2 183
iaor3 184
iaor4 185
iar1 186
iar3 187
iar4 188
ie1 189
ie2 190
ie3 191
ie4 192
ie5 193
ii1 194
ii2 195
ii3 196
ii4 197
ii5 198
iii1 199
iii2 200
iii3 201
iii4 202
iii5 203
iiir1 204
iiir4 205
iir2 206
in1 207
in2 208
in3 209
in4 210
in5 211
ing1 212
ing2 213
ing3 214
ing4 215
ing5 216
ingr1 217
ingr2 218
ingr3 219
ingr4 220
inr1 221
inr4 222
io1 223
io3 224
io5 225
iong1 226
iong2 227
iong3 228
iong4 229
iong5 230
iou1 231
iou2 232
iou3 233
iou4 234
iou5 235
iour1 236
iour2 237
iour3 238
iour4 239
ir1 240
ir2 241
ir3 242
ir4 243
ir5 244
j 245
k 246
l 247
m 248
n 249
o1 250
o2 251
o3 252
o4 253
o5 254
ong1 255
ong2 256
ong3 257
ong4 258
ong5 259
ongr4 260
or2 261
ou1 262
ou2 263
ou3 264
ou4 265
ou5 266
our2 267
our3 268
our4 269
our5 270
p 271
q 272
r 273
s 274
sh 275
sil 276
sp 277
spl 278
spn 279
t 280
u1 281
u2 282
u3 283
u4 284
u5 285
ua1 286
ua2 287
ua3 288
ua4 289
ua5 290
uai1 291
uai2 292
uai3 293
uai4 294
uai5 295
uair4 296
uan1 297
uan2 298
uan3 299
uan4 300
uan5 301
uang1 302
uang2 303
uang3 304
uang4 305
uang5 306
uangr4 307
uanr1 308
uanr2 309
uanr3 310
uanr4 311
uanr5 312
uar1 313
uar2 314
uar4 315
uei1 316
uei2 317
uei3 318
uei4 319
uei5 320
ueir1 321
ueir2 322
ueir3 323
ueir4 324
uen1 325
uen2 326
uen3 327
uen4 328
uen5 329
ueng1 330
ueng2 331
ueng3 332
ueng4 333
uenr1 334
uenr2 335
uenr3 336
uenr4 337
uo1 338
uo2 339
uo3 340
uo4 341
uo5 342
uor1 343
uor2 344
uor3 345
uor5 346
ur1 347
ur2 348
ur3 349
ur4 350
ur5 351
v1 352
v2 353
v3 354
v4 355
v5 356
van1 357
van2 358
van3 359
van4 360
van5 361
vanr1 362
vanr2 363
vanr3 364
vanr4 365
ve1 366
ve2 367
ve3 368
ve4 369
ve5 370
ver3 371
ver4 372
vn1 373
vn2 374
vn3 375
vn4 376
vn5 377
vnr2 378
vr3 379
x 380
z 381
zh 382
, 383
. 384
? 385
! 386
<eos> 387
"""
if __name__ == '__main__':
with tempfile.NamedTemporaryFile(mode='wt') as f:
phone_ids = phone_id_str.split()
for phone, id in zip(phone_ids[::2], phone_ids[1::2]):
f.write(f"{phone} {id}")
f.write('\n')
f.flush()
frontend = MixFrontend(phone_vocab_path=f.name)
text = "hello, 我爱北京天安们,what about you."
print(text)
# [('hello, ', 'en'), ('我爱北京天安们,', 'zh'), ('what about you.', 'en')]
segs = frontend.split_by_lang(text)
print(segs)
text = "hello?!!我爱北京天安们,what about you."
print(text)
# [('hello?!!', 'en'), ('我爱北京天安们,', 'zh'), ('what about you.', 'en')]
segs = frontend.split_by_lang(text)
print(segs)
text = "<speak> hello,我爱北京天安们,what about you."
print(text)
# [('<speak> hello,', 'en'), ('我爱北京天安们,', 'zh'), ('what about you.', 'en')]
segs = frontend.split_by_lang(text)
print(segs)
# The SSML/XML tags are not handled well here. Parse the SSML first, then split into zh/en parts.
text = "<speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>"
print(text)
# [('<speak>', 'en'), ('我们的声学模型使用了 ', 'zh'), ('Fast Speech Two。', 'en'), ('前浪<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒</', 'zh'), ('say-as>', 'en'), ('在沙滩上,沙滩上倒了一堆<', 'zh'), ("say-as pinyin='tu3'>", 'en'), ('土</', 'zh'), ('say-as>。 ', 'en'), ('想象<', 'zh'), ("say-as pinyin='gan1 gan1'>", 'en'), ('干干</', 'zh'), ('say-as>', 'en'), ('的树干<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒</', 'zh'), ('say-as>', 'en'), ('了, 里面有个干尸,不知是被谁<', 'zh'), ("say-as pinyin='gan4'>", 'en'), ('干</', 'zh'), ('say-as>', 'en'), ('死的。</', 'zh'), ('speak>', 'en')]
segs = frontend.split_by_lang(text)
print(segs)
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor
if __name__ == '__main__':
text = "你好吗,<speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>thank you."
# SSML: 13
# 0 ['你好吗,', []]
# 1 ['我们的声学模型使用了FastSpeechTwo。前浪', []]
# 2 ['倒', ['dao3']]
# 3 ['在沙滩上,沙滩上倒了一堆', []]
# 4 ['土', ['tu3']]
# 5 ['。想象', []]
# 6 ['干干', ['gan1', 'gan1']]
# 7 ['的树干', []]
# 8 ['倒', ['dao3']]
# 9 ['了,里面有个干尸,不知是被谁', []]
# 10 ['干', ['gan4']]
# 11 ['死的。', []]
# 12 ['thank you.', []]
inputs = MixTextProcessor.get_pinyin_split(text)
print(f"SSML get_pinyin_split: {len(inputs)}")
for i, sub in enumerate(inputs):
print(i, sub)
print()
# SSML get_dom_split: 13
# 0 你好吗,
# 1 我们的声学模型使用了 Fast Speech Two。前浪
# 2 <say-as pinyin="dao3">倒</say-as>
# 3 在沙滩上,沙滩上倒了一堆
# 4 <say-as pinyin="tu3">土</say-as>
# 5 。 想象
# 6 <say-as pinyin="gan1 gan1">干干</say-as>
# 7 的树干
# 8 <say-as pinyin="dao3">倒</say-as>
# 9 了, 里面有个干尸,不知是被谁
# 10 <say-as pinyin="gan4">干</say-as>
# 11 死的。
# 12 thank you.
inputs = MixTextProcessor.get_dom_split(text)
print(f"SSML get_dom_split: {len(inputs)}")
for i, sub in enumerate(inputs):
print(i, sub)
print()
# SSML object.get_pinyin_split: 246
# <speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>
outs = MixTextProcessor().get_xml_content(text)
print(f"SSML object.get_pinyin_split: {len(outs)}")
print(outs)
print()
# SSML object.get_content_split: 3
# 0 你好吗,
# 1 <speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>
# 倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>
# 2 thank you.
outs = MixTextProcessor().get_content_split(text)
print(f"SSML object.get_content_split: {len(outs)}")
for i, sub in enumerate(outs):
print(i, sub)
print()