# Post-patch state of three MixFrontend methods from
# paddlespeech/t2s/frontend/mix_frontend.py, reconstructed from the diff.
# They take ``self`` and belong inside the ``MixFrontend`` class body.


def is_end(self, before_char, after_char) -> bool:
    """Tell whether a "." between two characters ends an English sentence.

    Args:
        before_char: The character immediately before the period.
        after_char: The character immediately after the period.

    Returns:
        True when each neighbour is either a space or a character accepted
        by ``self.is_alphabet``; False otherwise.
    """

    def _letter_or_space(char) -> bool:
        return self.is_alphabet(char) or char == " "

    return bool(_letter_or_space(before_char) and _letter_or_space(after_char))


def _replace(self, text: str) -> str:
    """Turn English sentence-ending periods into the full stop "。".

    A "." is replaced only when it is neither the first nor the last
    character of ``text`` and ``self.is_end`` accepts its two neighbours.
    Every other character, including all remaining periods, is copied
    through unchanged, so the result always has the same length as the
    input.

    Args:
        text: The text to scan.

    Returns:
        A copy of ``text`` with the qualifying periods replaced.
    """
    last_pos = len(text) - 1
    pieces = []
    for pos, char in enumerate(text):
        # A period on either edge has no neighbour on one side, so it is
        # left alone; the neighbours are always read from the original text.
        if (char == "." and 0 < pos < last_pos and
                self.is_end(text[pos - 1], text[pos + 1])):
            pieces.append("。")
        else:
            pieces.append(char)
    return "".join(pieces)


def _split(self, text: str) -> List[str]:
    """Strip unsupported symbols from ``text`` and cut it into sentences.

    Args:
        text: The raw input text.

    Returns:
        The stripped pieces obtained by splitting on the newlines that
        ``self.SENTENCE_SPLITOR`` inserts.
    """
    text = re.sub(r'[《》【】<=>{}()()#&@“”^_|…\\]', '', text)
    # Replace English sentence-ending periods "." with "。" so the sentence
    # splitter below can break on them.
    text = self._replace(text)
    # NOTE(review): SENTENCE_SPLITOR is defined outside this view; the
    # r'\1\n' replacement requires it to have a capture group -- confirm.
    text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
    text = text.strip()
    return [sentence.strip() for sentence in re.split(r'\n+', text)]