提交 42f2186d 编写于 作者: H Hui Zhang

more comment on tts frontend

上级 8aa9790c
...@@ -99,14 +99,23 @@ def norm(data, mean, std): ...@@ -99,14 +99,23 @@ def norm(data, mean, std):
return (data - mean) / std return (data - mean) / std
def get_chunks(data, block_size: int, pad_size: int): def get_chunks(mel, chunk_size: int, pad_size: int):
data_len = data.shape[1] """
Split mel by chunk size with left and right context.
Args:
mel (paddle.Tensor): mel spectrogram, shape (B, T, D)
chunk_size (int): chunk size
pad_size (int): size for left and right context.
"""
T = mel.shape[1]
n = math.ceil(T / chunk_size)
chunks = [] chunks = []
n = math.ceil(data_len / block_size)
for i in range(n): for i in range(n):
start = max(0, i * block_size - pad_size) start = max(0, i * chunk_size - pad_size)
end = min((i + 1) * block_size + pad_size, data_len) end = min((i + 1) * chunk_size + pad_size, T)
chunks.append(data[:, start:end, :]) chunks.append(mel[:, start:end, :])
return chunks return chunks
...@@ -117,14 +126,10 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'): ...@@ -117,14 +126,10 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
with open(text_file, 'rt', encoding='utf-8') as f: with open(text_file, 'rt', encoding='utf-8') as f:
for line in f: for line in f:
if line.strip() != "": if line.strip() != "":
items = re.split(r"\s+", line.strip(), 1) items = re.split(r"\s+", line.strip(), maxsplit=1)
assert len(items) == 2
utt_id = items[0] utt_id = items[0]
if lang in {'zh', 'canton'}: sentence = items[1]
sentence = "".join(items[1:])
elif lang == 'en':
sentence = " ".join(items[1:])
elif lang == 'mix':
sentence = " ".join(items[1:])
sentences.append((utt_id, sentence)) sentences.append((utt_id, sentence))
return sentences return sentences
...@@ -319,6 +324,7 @@ def run_frontend( ...@@ -319,6 +324,7 @@ def run_frontend(
input_ids = {} input_ids = {}
if text.strip() != "" and re.match(r".*?<speak>.*?</speak>.*", text, if text.strip() != "" and re.match(r".*?<speak>.*?</speak>.*", text,
re.DOTALL): re.DOTALL):
# using ssml
input_ids = frontend.get_input_ids_ssml( input_ids = frontend.get_input_ids_ssml(
text, text,
merge_sentences=merge_sentences, merge_sentences=merge_sentences,
...@@ -359,6 +365,7 @@ def run_frontend( ...@@ -359,6 +365,7 @@ def run_frontend(
outs.update({'is_slurs': is_slurs}) outs.update({'is_slurs': is_slurs})
else: else:
print("lang should in {'zh', 'en', 'mix', 'canton', 'sing'}!") print("lang should in {'zh', 'en', 'mix', 'canton', 'sing'}!")
outs.update({'phone_ids': phone_ids}) outs.update({'phone_ids': phone_ids})
return outs return outs
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
import argparse import argparse
from pathlib import Path from pathlib import Path
from pprint import pprint
import paddle import paddle
import soundfile as sf import soundfile as sf
...@@ -78,6 +79,7 @@ def evaluate(args): ...@@ -78,6 +79,7 @@ def evaluate(args):
# whether dygraph to static # whether dygraph to static
if args.inference_dir: if args.inference_dir:
print("convert am and voc to static model.")
# acoustic model # acoustic model
am_inference = am_to_static( am_inference = am_to_static(
am_inference=am_inference, am_inference=am_inference,
...@@ -92,6 +94,7 @@ def evaluate(args): ...@@ -92,6 +94,7 @@ def evaluate(args):
output_dir = Path(args.output_dir) output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
merge_sentences = False merge_sentences = False
# Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph # Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph
# but still not stopping in the end (NOTE by yuantian01 Feb 9 2022) # but still not stopping in the end (NOTE by yuantian01 Feb 9 2022)
...@@ -102,12 +105,18 @@ def evaluate(args): ...@@ -102,12 +105,18 @@ def evaluate(args):
if am_name == 'speedyspeech': if am_name == 'speedyspeech':
get_tone_ids = True get_tone_ids = True
# wav samples
N = 0 N = 0
# inference time cost
T = 0 T = 0
# [(uid, text), ]
if am_name == 'diffsinger': if am_name == 'diffsinger':
sentences = get_sentences_svs(text_file=args.text) sentences = get_sentences_svs(text_file=args.text)
else: else:
sentences = get_sentences(text_file=args.text, lang=args.lang) sentences = get_sentences(text_file=args.text, lang=args.lang)
pprint(f"inputs: {sentences}")
for utt_id, sentence in sentences: for utt_id, sentence in sentences:
with timer() as t: with timer() as t:
if am_name == "diffsinger": if am_name == "diffsinger":
...@@ -116,6 +125,8 @@ def evaluate(args): ...@@ -116,6 +125,8 @@ def evaluate(args):
else: else:
text = sentence text = sentence
svs_input = None svs_input = None
# frontend
frontend_dict = run_frontend( frontend_dict = run_frontend(
frontend=frontend, frontend=frontend,
text=text, text=text,
...@@ -124,25 +135,33 @@ def evaluate(args): ...@@ -124,25 +135,33 @@ def evaluate(args):
lang=args.lang, lang=args.lang,
svs_input=svs_input) svs_input=svs_input)
phone_ids = frontend_dict['phone_ids'] phone_ids = frontend_dict['phone_ids']
# pprint(f"process: {utt_id} {phone_ids}")
with paddle.no_grad(): with paddle.no_grad():
flags = 0 flags = 0
for i in range(len(phone_ids)): for i in range(len(phone_ids)):
# sub phone, split by `sp` or punctuation.
part_phone_ids = phone_ids[i] part_phone_ids = phone_ids[i]
# acoustic model # acoustic model
if am_name == 'fastspeech2': if am_name == 'fastspeech2':
# multi speaker # multi speaker
if am_dataset in {"aishell3", "vctk", "mix", "canton"}: if am_dataset in {"aishell3", "vctk", "mix", "canton"}:
# multi-speaker
spk_id = paddle.to_tensor(args.spk_id) spk_id = paddle.to_tensor(args.spk_id)
mel = am_inference(part_phone_ids, spk_id) mel = am_inference(part_phone_ids, spk_id)
else: else:
# single-speaker
mel = am_inference(part_phone_ids) mel = am_inference(part_phone_ids)
elif am_name == 'speedyspeech': elif am_name == 'speedyspeech':
part_tone_ids = frontend_dict['tone_ids'][i] part_tone_ids = frontend_dict['tone_ids'][i]
if am_dataset in {"aishell3", "vctk", "mix"}: if am_dataset in {"aishell3", "vctk", "mix"}:
# multi-speaker
spk_id = paddle.to_tensor(args.spk_id) spk_id = paddle.to_tensor(args.spk_id)
mel = am_inference(part_phone_ids, part_tone_ids, mel = am_inference(part_phone_ids, part_tone_ids,
spk_id) spk_id)
else: else:
# single-speaker
mel = am_inference(part_phone_ids, part_tone_ids) mel = am_inference(part_phone_ids, part_tone_ids)
elif am_name == 'tacotron2': elif am_name == 'tacotron2':
mel = am_inference(part_phone_ids) mel = am_inference(part_phone_ids)
...@@ -155,6 +174,7 @@ def evaluate(args): ...@@ -155,6 +174,7 @@ def evaluate(args):
note=part_note_ids, note=part_note_ids,
note_dur=part_note_durs, note_dur=part_note_durs,
is_slur=part_is_slurs, ) is_slur=part_is_slurs, )
# vocoder # vocoder
wav = voc_inference(mel) wav = voc_inference(mel)
if flags == 0: if flags == 0:
...@@ -162,17 +182,23 @@ def evaluate(args): ...@@ -162,17 +182,23 @@ def evaluate(args):
flags = 1 flags = 1
else: else:
wav_all = paddle.concat([wav_all, wav]) wav_all = paddle.concat([wav_all, wav])
wav = wav_all.numpy() wav = wav_all.numpy()
N += wav.size N += wav.size
T += t.elapse T += t.elapse
# samples per second
speed = wav.size / t.elapse speed = wav.size / t.elapse
# generating one second of audio takes `RTF` seconds
rtf = am_config.fs / speed rtf = am_config.fs / speed
print( print(
f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
) )
sf.write( sf.write(
str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs) str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs)
print(f"{utt_id} done!") print(f"{utt_id} done!")
print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }") print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")
......
...@@ -11,7 +11,6 @@ ...@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from paddlespeech.t2s.frontend.phonectic import Phonetics
""" """
A phonology system with ARPABET symbols and limited punctuations. The G2P A phonology system with ARPABET symbols and limited punctuations. The G2P
conversion is done by g2p_en. conversion is done by g2p_en.
...@@ -19,55 +18,68 @@ conversion is done by g2p_en. ...@@ -19,55 +18,68 @@ conversion is done by g2p_en.
Note that g2p_en does not handle hyphenated words well. So make sure the input Note that g2p_en does not handle hyphenated words well. So make sure the input
sentence is first normalized. sentence is first normalized.
""" """
from paddlespeech.t2s.frontend.vocab import Vocab
from g2p_en import G2p from g2p_en import G2p
from paddlespeech.t2s.frontend.phonectic import Phonetics
from paddlespeech.t2s.frontend.vocab import Vocab
class ARPABET(Phonetics): class ARPABET(Phonetics):
"""A phonology for English that uses ARPABET as the phoneme vocabulary. """A phonology for English that uses ARPABET without stress as the phoneme vocabulary.
47 symbols = 39 phones + 4 punctuations + 4 special tokens(<pad> <unk> <s> </s>)
The current phoneme set contains 39 phonemes, vowels carry a lexical stress marker:
0 — No stress
1 — Primary stress
2 — Secondary stress
Phoneme Set:
Phoneme Example Translation
------- ------- -----------
AA odd AA D
AE at AE T
AH hut HH AH T
AO ought AO T
AW cow K AW
AY hide HH AY D
B be B IY
CH cheese CH IY Z
D dee D IY
DH thee DH IY
EH Ed EH D
ER hurt HH ER T
EY ate EY T
F fee F IY
G green G R IY N
HH he HH IY
IH it IH T
IY eat IY T
JH gee JH IY
K key K IY
L lee L IY
M me M IY
N knee N IY
NG ping P IH NG
OW oat OW T
OY toy T OY
P pee P IY
R read R IY D
S sea S IY
SH she SH IY
T tea T IY
TH theta TH EY T AH
UH hood HH UH D
UW two T UW
V vee V IY
W we W IY
Y yield Y IY L D
Z zee Z IY
ZH seizure S IY ZH ER
See http://www.speech.cs.cmu.edu/cgi-bin/cmudict for more details. See http://www.speech.cs.cmu.edu/cgi-bin/cmudict for more details.
Phoneme Example Translation
------- ------- -----------
AA odd AA D
AE at AE T
AH hut HH AH T
AO ought AO T
AW cow K AW
AY hide HH AY D
B be B IY
CH cheese CH IY Z
D dee D IY
DH thee DH IY
EH Ed EH D
ER hurt HH ER T
EY ate EY T
F fee F IY
G green G R IY N
HH he HH IY
IH it IH T
IY eat IY T
JH gee JH IY
K key K IY
L lee L IY
M me M IY
N knee N IY
NG ping P IH NG
OW oat OW T
OY toy T OY
P pee P IY
R read R IY D
S sea S IY
SH she SH IY
T tea T IY
TH theta TH EY T AH
UH hood HH UH D
UW two T UW
V vee V IY
W we W IY
Y yield Y IY L D
Z zee Z IY
ZH seizure S IY ZH ER
""" """
# 39 phonemes
phonemes = [ phonemes = [
'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH', 'EH', 'ER', 'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH', 'EH', 'ER',
'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW',
...@@ -76,6 +88,8 @@ class ARPABET(Phonetics): ...@@ -76,6 +88,8 @@ class ARPABET(Phonetics):
] ]
punctuations = [',', '.', '?', '!'] punctuations = [',', '.', '?', '!']
symbols = phonemes + punctuations symbols = phonemes + punctuations
# vowels carry a lexical stress marker:
# 0: unstressed, 1: primary stress, 2: secondary stress
_stress_to_no_stress_ = { _stress_to_no_stress_ = {
'AA0': 'AA', 'AA0': 'AA',
'AA1': 'AA', 'AA1': 'AA',
...@@ -124,7 +138,12 @@ class ARPABET(Phonetics): ...@@ -124,7 +138,12 @@ class ARPABET(Phonetics):
'UW2': 'UW' 'UW2': 'UW'
} }
def __repr__(self):
    """Return a short summary: the phoneme count and the punctuation list.

    ``phonemes`` and ``punctuations`` are class attributes, so they must be
    reached through ``self``; bare names are not visible from method scope
    and would raise ``NameError`` when the repr is requested.
    """
    fmt = "ARPABETWithoutStress(phonemes: {}, punctuations: {})"
    return fmt.format(len(self.phonemes), self.punctuations)
def __init__(self): def __init__(self):
# https://github.com/Kyubyong/g2p/blob/master/g2p_en/g2p.py
self.backend = G2p() self.backend = G2p()
self.vocab = Vocab(self.phonemes + self.punctuations) self.vocab = Vocab(self.phonemes + self.punctuations)
...@@ -139,6 +158,7 @@ class ARPABET(Phonetics): ...@@ -139,6 +158,7 @@ class ARPABET(Phonetics):
Returns: Returns:
List[str]: The list of pronunciation sequence. List[str]: The list of pronunciation sequence.
""" """
# g2p and remove vowel stress
phonemes = [ phonemes = [
self._remove_vowels(item) for item in self.backend(sentence) self._remove_vowels(item) for item in self.backend(sentence)
] ]
...@@ -158,6 +178,7 @@ class ARPABET(Phonetics): ...@@ -158,6 +178,7 @@ class ARPABET(Phonetics):
Returns: Returns:
List[int]: The list of pronunciation id sequence. List[int]: The list of pronunciation id sequence.
""" """
# phonemes to ids
ids = [self.vocab.lookup(item) for item in phonemes] ids = [self.vocab.lookup(item) for item in phonemes]
return ids return ids
...@@ -189,11 +210,16 @@ class ARPABET(Phonetics): ...@@ -189,11 +210,16 @@ class ARPABET(Phonetics):
def vocab_size(self): def vocab_size(self):
""" Vocab size. """ Vocab size.
""" """
# 47 = 39 phones + 4 punctuations + 4 special tokens # 47 = 39 phones + 4 punctuations + 4 special tokens(<pad> <unk> <s> </s>)
return len(self.vocab) return len(self.vocab)
class ARPABETWithStress(Phonetics): class ARPABETWithStress(Phonetics):
"""
A phonology for English that uses ARPABET with stress as the phoneme vocabulary.
77 symbols = 69 phones + 4 punctuations + 4 special tokens
"""
phonemes = [ phonemes = [
'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0', 'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D',
...@@ -206,6 +232,10 @@ class ARPABETWithStress(Phonetics): ...@@ -206,6 +232,10 @@ class ARPABETWithStress(Phonetics):
punctuations = [',', '.', '?', '!'] punctuations = [',', '.', '?', '!']
symbols = phonemes + punctuations symbols = phonemes + punctuations
def __repr__(self):
    """Return a short summary: the phoneme count and the punctuation list.

    ``phonemes`` and ``punctuations`` are class attributes, so they must be
    reached through ``self``; bare names are not visible from method scope
    and would raise ``NameError`` when the repr is requested.
    """
    fmt = "ARPABETWithStress(phonemes: {}, punctuations: {})"
    return fmt.format(len(self.phonemes), self.punctuations)
def __init__(self): def __init__(self):
self.backend = G2p() self.backend = G2p()
self.vocab = Vocab(self.phonemes + self.punctuations) self.vocab = Vocab(self.phonemes + self.punctuations)
......
...@@ -47,4 +47,5 @@ polyphonic: ...@@ -47,4 +47,5 @@ polyphonic:
恶行: ['e4','xing2'] 恶行: ['e4','xing2']
: ['ai4'] : ['ai4']
扎实: ['zha1','shi2'] 扎实: ['zha1','shi2']
干将: ['gan4','jiang4'] 干将: ['gan4','jiang4']
\ No newline at end of file 陈威行: ['chen2', 'wei1', 'hang2']
\ No newline at end of file
...@@ -90,13 +90,14 @@ class MixTextProcessor(): ...@@ -90,13 +90,14 @@ class MixTextProcessor():
dom = DomXml(in_xml) dom = DomXml(in_xml)
tags = dom.get_text_and_sayas_tags() tags = dom.get_text_and_sayas_tags()
ctlist.extend(tags) ctlist.extend(tags)
ctlist.append(after_xml) ctlist.append(after_xml)
return ctlist return ctlist
else: else:
ctlist.append(mixstr) ctlist.append(mixstr)
return ctlist return ctlist
class DomXml(): class DomXml():
def __init__(self, xmlstr): def __init__(self, xmlstr):
self.tdom = parseString(xmlstr) #Document self.tdom = parseString(xmlstr) #Document
......
...@@ -20,6 +20,9 @@ from pypinyin import Style ...@@ -20,6 +20,9 @@ from pypinyin import Style
class ToneSandhi(): class ToneSandhi():
def __repr__(self):
    """Return the fixed display name used for this tone-sandhi processor."""
    label = "MandarinToneSandhi"
    return label
def __init__(self): def __init__(self):
self.must_neural_tone_words = { self.must_neural_tone_words = {
'麻烦', '麻利', '鸳鸯', '高粱', '骨头', '骆驼', '马虎', '首饰', '馒头', '馄饨', '风筝', '麻烦', '麻利', '鸳鸯', '高粱', '骨头', '骆驼', '马虎', '首饰', '馒头', '馄饨', '风筝',
...@@ -69,6 +72,19 @@ class ToneSandhi(): ...@@ -69,6 +72,19 @@ class ToneSandhi():
} }
self.punc = ":,;。?!“”‘’':,;.?!" self.punc = ":,;。?!“”‘’':,;.?!"
def _split_word(self, word: str) -> List[str]:
    """Split ``word`` into two pieces around its shortest jieba sub-word.

    The sub-words produced by ``jieba.cut_for_search`` are ordered by length
    and the shortest one (the earliest among ties) is used as the pivot.

    Args:
        word: the word to split.

    Returns:
        ``[pivot, rest]`` when ``word`` starts with the pivot, otherwise
        ``[rest, pivot]`` where ``rest`` is ``word`` with ``len(pivot)``
        trailing characters removed.
    """
    candidates = sorted(jieba.cut_for_search(word), key=len)
    pivot = candidates[0]
    if word.startswith(pivot):
        return [pivot, word[len(pivot):]]
    # The pivot is not a prefix: drop as many trailing characters as the
    # pivot is long and place the pivot last.
    return [word[:-len(pivot)], pivot]
# the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041 # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
# e.g. # e.g.
# word: "家里" # word: "家里"
...@@ -154,18 +170,8 @@ class ToneSandhi(): ...@@ -154,18 +170,8 @@ class ToneSandhi():
finals[i] = finals[i][:-1] + "4" finals[i] = finals[i][:-1] + "4"
return finals return finals
def _split_word(self, word: str) -> List[str]: def _all_tone_three(self, finals: List[str]) -> bool:
word_list = jieba.cut_for_search(word) return all(x[-1] == "3" for x in finals)
word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
first_subword = word_list[0]
first_begin_idx = word.find(first_subword)
if first_begin_idx == 0:
second_subword = word[len(first_subword):]
new_word_list = [first_subword, second_subword]
else:
second_subword = word[:-len(first_subword)]
new_word_list = [second_subword, first_subword]
return new_word_list
def _three_sandhi(self, word: str, finals: List[str]) -> List[str]: def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
...@@ -207,9 +213,6 @@ class ToneSandhi(): ...@@ -207,9 +213,6 @@ class ToneSandhi():
return finals return finals
def _all_tone_three(self, finals: List[str]) -> bool:
    """Report whether every entry of ``finals`` ends with the character "3".

    An empty ``finals`` list yields ``True``.
    """
    for final in finals:
        if final[-1] != "3":
            return False
    return True
# merge "不" and the word behind it # merge "不" and the word behind it
# if not merged, "不" sometimes appears alone according to jieba, which may cause sandhi errors # if not merged, "不" sometimes appears alone according to jieba, which may cause sandhi errors
def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
...@@ -336,6 +339,9 @@ class ToneSandhi(): ...@@ -336,6 +339,9 @@ class ToneSandhi():
def pre_merge_for_modify( def pre_merge_for_modify(
self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
"""
seg: [(word, pos), ...]
"""
seg = self._merge_bu(seg) seg = self._merge_bu(seg)
seg = self._merge_yi(seg) seg = self._merge_yi(seg)
seg = self._merge_reduplication(seg) seg = self._merge_reduplication(seg)
...@@ -346,7 +352,11 @@ class ToneSandhi(): ...@@ -346,7 +352,11 @@ class ToneSandhi():
def modified_tone(self, word: str, pos: str, def modified_tone(self, word: str, pos: str,
finals: List[str]) -> List[str]: finals: List[str]) -> List[str]:
"""
word: the segmented word
pos: part-of-speech tag of the word
finals: finals with tone marks, [final1, ..., finaln]
"""
finals = self._bu_sandhi(word, finals) finals = self._bu_sandhi(word, finals)
finals = self._yi_sandhi(word, finals) finals = self._yi_sandhi(word, finals)
finals = self._neural_sandhi(word, pos, finals) finals = self._neural_sandhi(word, pos, finals)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册