Merge pull request #1040 from yt605155624/fix_frontend

[TTS]update text frontend

Merge pull request #1040 from yt605155624/fix_frontend
[TTS]update text frontend
022f1ce8 · 小湉湉 · GitHub · 32afa23e · a861e56e · 022f1ce8
13 changed files
--- a/demos/style_fs2/style_syn.py
+++ b/demos/style_fs2/style_syn.py
@@ -34,7 +34,9 @@ def evaluate(args, fastspeech2_config, pwg_config):
    sentences = []
    with open(args.text, 'rt') as f:
        for line in f:
-            utt_id, sentence = line.strip().split()
+            items = line.strip().split()
+            utt_id = items[0]
+            sentence = "".join(items[1:])
            sentences.append((utt_id, sentence))
    with open(args.phones_dict, "r") as f:

--- a/examples/ljspeech/voc1/README.md
+++ b/examples/ljspeech/voc1/README.md
@@ -137,4 +137,4 @@ pwg_ljspeech_ckpt_0.5
 └── pwg_stats.npy                 # statistics used to normalize spectrogram when training parallel wavegan
 ```
 ## Acknowledgement
 We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
\ No newline at end of file
--- a/paddlespeech/t2s/exps/fastspeech2/inference.py
+++ b/paddlespeech/t2s/exps/fastspeech2/inference.py
@@ -82,7 +82,9 @@ def main():
    with open(args.text, 'rt') as f:
        for line in f:
-            utt_id, sentence = line.strip().split()
+            items = line.strip().split()
+            utt_id = items[0]
+            sentence = "".join(items[1:])
            sentences.append((utt_id, sentence))
    for utt_id, sentence in sentences:

--- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py
@@ -37,7 +37,9 @@ def evaluate(args, fastspeech2_config, pwg_config):
    sentences = []
    with open(args.text, 'rt') as f:
        for line in f:
-            utt_id, sentence = line.strip().split()
+            items = line.strip().split()
+            utt_id = items[0]
+            sentence = "".join(items[1:])
            sentences.append((utt_id, sentence))
    with open(args.phones_dict, "r") as f:

--- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py
@@ -40,7 +40,9 @@ def evaluate(args, fastspeech2_config, pwg_config):
    sentences = []
    with open(args.text, 'rt') as f:
        for line in f:
-            utt_id, sentence = line.strip().split()
+            items = line.strip().split()
+            utt_id = items[0]
+            sentence = "".join(items[1:])
            sentences.append((utt_id, sentence))
    with open(args.phones_dict, "r") as f:

--- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py
+++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py
@@ -40,7 +40,9 @@ def evaluate(args, fastspeech2_config, melgan_config):
    sentences = []
    with open(args.text, 'rt') as f:
        for line in f:
-            utt_id, sentence = line.strip().split()
+            items = line.strip().split()
+            utt_id = items[0]
+            sentence = "".join(items[1:])
            sentences.append((utt_id, sentence))
    with open(args.phones_dict, "r") as f:

--- a/paddlespeech/t2s/exps/speedyspeech/inference.py
+++ b/paddlespeech/t2s/exps/speedyspeech/inference.py
@@ -87,7 +87,9 @@ def main():
    with open(args.text, 'rt') as f:
        for line in f:
-            utt_id, sentence = line.strip().split()
+            items = line.strip().split()
+            utt_id = items[0]
+            sentence = "".join(items[1:])
            sentences.append((utt_id, sentence))
    for utt_id, sentence in sentences:

--- a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py
@@ -40,7 +40,9 @@ def evaluate(args, speedyspeech_config, pwg_config):
    sentences = []
    with open(args.text, 'rt') as f:
        for line in f:
-            utt_id, sentence = line.strip().split()
+            items = line.strip().split()
+            utt_id = items[0]
+            sentence = "".join(items[1:])
            sentences.append((utt_id, sentence))
    with open(args.phones_dict, "r") as f:

--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
@@ -129,6 +129,8 @@ class Frontend():
                # we discriminate i, ii and iii
                if c and c not in self.punc:
                    phones.append(c)
+                if c and c in self.punc:
+                    phones.append('sp')
                if v and v not in self.punc:
                    phones.append(v)
            # add sp between sentence (replace the last punc with sp)
@@ -149,9 +151,14 @@ class Frontend():
        if word not in self.must_erhua and (word in self.not_erhua or
                                            pos in {"a", "j", "nr"}):
            return initials, finals
+        # "……" 等情况直接返回
+        if len(finals) != len(word):
+            return initials, finals
+        assert len(finals) == len(word)
        new_initials = []
        new_finals = []
-        assert len(finals) == len(word)
        for i, phn in enumerate(finals):
            if i == len(finals) - 1 and word[i] == "儿" and phn in {
                    "er2", "er5"

--- a/paddlespeech/t2s/frontend/zh_normalization/chronology.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/chronology.py
@@ -32,6 +32,15 @@ RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])'
                     r':([0-5][0-9])'
                     r'(:([0-5][0-9]))?')
+# 时间范围，如8:30-12:30
+RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])'
+                           r':([0-5][0-9])'
+                           r'(:([0-5][0-9]))?'
+                           r'(~|-)'
+                           r'([0-1]?[0-9]|2[0-3])'
+                           r':([0-5][0-9])'
+                           r'(:([0-5][0-9]))?')
 def replace_time(match) -> str:
    """
@@ -42,15 +51,32 @@ def replace_time(match) -> str:
    ----------
    str
    """
+    is_range = len(match.groups()) > 5
    hour = match.group(1)
    minute = match.group(2)
    second = match.group(4)
+    if is_range:
+        hour_2 = match.group(6)
+        minute_2 = match.group(7)
+        second_2 = match.group(9)
    result = f"{num2str(hour)}点"
    if minute.lstrip('0'):
        result += f"{_time_num2str(minute)}分"
    if second and second.lstrip('0'):
        result += f"{_time_num2str(second)}秒"
+    if is_range:
+        result += "至"
+        result += f"{num2str(hour_2)}点"
+        if minute_2.lstrip('0'):
+            result += f"{_time_num2str(minute_2)}分"
+        if second_2 and second_2.lstrip('0'):
+            result += f"{_time_num2str(second_2)}秒"
    return result

--- a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py
@@ -26,16 +26,19 @@ RE_MOBILE_PHONE = re.compile(
 RE_TELEPHONE = re.compile(
    r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{7,8})(?!\d)")
+# 全国统一的号码400开头
+RE_NATIONAL_UNIFORM_NUMBER = re.compile(r"(400)(-)?\d{3}(-)?\d{4}")
 def phone2str(phone_string: str, mobile=True) -> str:
    if mobile:
        sp_parts = phone_string.strip('+').split()
-        result = ''.join(
+        result = '，'.join(
            [verbalize_digit(part, alt_one=True) for part in sp_parts])
        return result
    else:
        sil_parts = phone_string.split('-')
-        result = ''.join(
+        result = '，'.join(
            [verbalize_digit(part, alt_one=True) for part in sil_parts])
        return result

--- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
@@ -18,6 +18,7 @@ from .char_convert import tranditional_to_simplified
 from .chronology import RE_DATE
 from .chronology import RE_DATE2
 from .chronology import RE_TIME
+from .chronology import RE_TIME_RANGE
 from .chronology import replace_date
 from .chronology import replace_date2
 from .chronology import replace_time
@@ -40,6 +41,7 @@ from .num import replace_percentage
 from .num import replace_positive_quantifier
 from .num import replace_range
 from .phonecode import RE_MOBILE_PHONE
+from .phonecode import RE_NATIONAL_UNIFORM_NUMBER
 from .phonecode import RE_TELEPHONE
 from .phonecode import replace_mobile
 from .phonecode import replace_phone
@@ -62,6 +64,8 @@ class TextNormalizer():
        List[str]
            Sentences.
        """
+        # Only for pure Chinese here
+        text = text.replace(" ", "")
        text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
        text = text.strip()
        sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
@@ -76,12 +80,19 @@ class TextNormalizer():
        # number related NSW verbalization
        sentence = RE_DATE.sub(replace_date, sentence)
        sentence = RE_DATE2.sub(replace_date2, sentence)
+        # range first
+        sentence = RE_TIME_RANGE.sub(replace_time, sentence)
        sentence = RE_TIME.sub(replace_time, sentence)
        sentence = RE_TEMPERATURE.sub(replace_temperature, sentence)
        sentence = RE_FRAC.sub(replace_frac, sentence)
        sentence = RE_PERCENTAGE.sub(replace_percentage, sentence)
        sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence)
        sentence = RE_TELEPHONE.sub(replace_phone, sentence)
+        sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence)
        sentence = RE_RANGE.sub(replace_range, sentence)
        sentence = RE_INTEGER.sub(replace_negative_num, sentence)
        sentence = RE_DECIMAL_NUM.sub(replace_number, sentence)
@@ -94,5 +105,6 @@ class TextNormalizer():
    def normalize(self, text: str) -> List[str]:
        sentences = self._split(text)
        sentences = [self.normalize_sentence(sent) for sent in sentences]
        return sentences
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@@ -307,7 +307,7 @@ class FastSpeech2(nn.Layer):
            num_embeddings=idim,
            embedding_dim=adim,
            padding_idx=self.padding_idx)
        if encoder_type == "transformer":
            print("encoder_type is transformer")
            self.encoder = TransformerEncoder(