From f43d0260006e145a5a23dcb0f947ca4c62f83253 Mon Sep 17 00:00:00 2001
From: HuangLiangJie <mailoflawrence@gmail.com>
Date: Wed, 9 Nov 2022 12:30:13 +0800
Subject: [PATCH] Add rhythm tags for MFA, test=tts (#2615)

* Add rhythm tags for MFA, test=tts
---
 examples/other/mfa/README.md                 |  3 +
 examples/other/mfa/local/generate_lexicon.py |  5 ++
 examples/other/mfa/local/reorganize_baker.py | 73 +++++++++++++++++++-
 3 files changed, 78 insertions(+), 3 deletions(-)

diff --git a/examples/other/mfa/README.md b/examples/other/mfa/README.md
index c24524ab..216d1275 100644
--- a/examples/other/mfa/README.md
+++ b/examples/other/mfa/README.md
@@ -4,3 +4,6 @@ Run the following script to get started, for more detail, please see `run.sh`.
 ```bash
 ./run.sh
 ```
+# Rhythm tags for MFA
+To get rhythm tags with durations from the MFA tool, add the `--rhy-with-duration` flag to the first two commands in `run.sh`.
+Note that only the CSMSC dataset is supported so far, and that `#` is replaced with `sp` in the rhythm tags for MFA.
diff --git a/examples/other/mfa/local/generate_lexicon.py b/examples/other/mfa/local/generate_lexicon.py
index e9445665..3deb2470 100644
--- a/examples/other/mfa/local/generate_lexicon.py
+++ b/examples/other/mfa/local/generate_lexicon.py
@@ -182,12 +182,17 @@ if __name__ == "__main__":
         "--with-tone", action="store_true", help="whether to consider tone.")
     parser.add_argument(
         "--with-r", action="store_true", help="whether to consider erhua.")
+    parser.add_argument(
+        "--rhy-with-duration",
+        action="store_true", )
     args = parser.parse_args()
 
     lexicon = generate_lexicon(args.with_tone, args.with_r)
     symbols = generate_symbols(lexicon)
 
     with open(args.output + ".lexicon", 'wt') as f:
+        if args.rhy_with_duration:
+            f.write("sp1 sp1\nsp2 sp2\nsp3 sp3\nsp4 sp4\n")
         for k, v in lexicon.items():
             f.write(f"{k} {v}\n")
 
diff --git a/examples/other/mfa/local/reorganize_baker.py b/examples/other/mfa/local/reorganize_baker.py
index 153e01d1..0e0035bd 100644
--- a/examples/other/mfa/local/reorganize_baker.py
+++ b/examples/other/mfa/local/reorganize_baker.py
@@ -23,6 +23,7 @@ for more details.
 """
 import argparse
 import os
+import re
 import shutil
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
@@ -32,6 +33,22 @@ import librosa
 import soundfile as sf
 from tqdm import tqdm
 
+repalce_dict = {
+    "；": "",
+    "。": "",
+    "：": "",
+    "—": "",
+    "）": "",
+    "，": "",
+    "“": "",
+    "（": "",
+    "、": "",
+    "…": "",
+    "！": "",
+    "？": "",
+    "”": ""
+}
+
 
 def get_transcripts(path: Union[str, Path]):
     transcripts = {}
@@ -55,9 +72,13 @@ def resample_and_save(source, target, sr=16000):
 
 def reorganize_baker(root_dir: Union[str, Path],
                      output_dir: Union[str, Path]=None,
-                     resample_audio=False):
+                     resample_audio=False,
+                     rhy_dur=False):
     root_dir = Path(root_dir).expanduser()
-    transcript_path = root_dir / "ProsodyLabeling" / "000001-010000.txt"
+    if rhy_dur:
+        transcript_path = root_dir / "ProsodyLabeling" / "000001-010000_rhy.txt"
+    else:
+        transcript_path = root_dir / "ProsodyLabeling" / "000001-010000.txt"
     transcriptions = get_transcripts(transcript_path)
 
     wave_dir = root_dir / "Wave"
@@ -92,6 +113,46 @@ def reorganize_baker(root_dir: Union[str, Path],
     print("Done!")
 
 
+def insert_rhy(sentence_first, sentence_second):
+    sub = '#'
+    return_words = []
+    sentence_first = sentence_first.translate(str.maketrans(repalce_dict))
+    rhy_idx = [substr.start() for substr in re.finditer(sub, sentence_first)]
+    re_rhy_idx = []
+    sentence_first_ = sentence_first.replace("#1", "").replace(
+        "#2", "").replace("#3", "").replace("#4", "")
+    sentence_seconds = sentence_second.split(" ")
+    for i, w in enumerate(rhy_idx):
+        re_rhy_idx.append(w - i * 2)
+    i = 0
+    # print("re_rhy_idx: ", re_rhy_idx)
+    for sentence_s in (sentence_seconds):
+        return_words.append(sentence_s)
+        if i < len(re_rhy_idx) and len(return_words) - i == re_rhy_idx[i]:
+            return_words.append("sp" + sentence_first[rhy_idx[i] + 1:rhy_idx[i]
+                                                      + 2])
+            i = i + 1
+    return return_words
+
+
+def normalize_rhy(root_dir: Union[str, Path]):
+    root_dir = Path(root_dir).expanduser()
+    transcript_path = root_dir / "ProsodyLabeling" / "000001-010000.txt"
+    target_transcript_path = root_dir / "ProsodyLabeling" / "000001-010000_rhy.txt"
+
+    with open(transcript_path) as f:
+        lines = f.readlines()
+
+    with open(target_transcript_path, 'wt') as f:
+        for i in range(0, len(lines), 2):
+            sentence_first = lines[i]  # the first line of each pair is written out unchanged
+            f.write(sentence_first)
+            transcription = lines[i + 1].strip()
+            f.write("\t" + " ".join(
+                insert_rhy(sentence_first.split('\t')[1], transcription)) +
+                    "\n")
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Reorganize Baker dataset for MFA")
@@ -104,6 +165,12 @@ if __name__ == "__main__":
         "--resample-audio",
         action="store_true",
         help="To resample audio files or just copy them")
+    parser.add_argument(
+        "--rhy-with-duration",
+        action="store_true", )
     args = parser.parse_args()
 
-    reorganize_baker(args.root_dir, args.output_dir, args.resample_audio)
+    if args.rhy_with_duration:
+        normalize_rhy(args.root_dir)
+    reorganize_baker(args.root_dir, args.output_dir, args.resample_audio,
+                     args.rhy_with_duration)
-- 
GitLab