From a7858551b735594e8c418de5c4807b47cdcfa5cf Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Fri, 19 Nov 2021 09:49:38 +0000
Subject: [PATCH] add utt2spk for all datasets

---
 .../dataset/aidatatang_200zh/aidatatang_200zh.py    |  3 +++
 .../dataset/mini_librispeech/mini_librispeech.py    | 13 +++++++------
 examples/dataset/thchs30/thchs30.py                 |  3 +++
 examples/dataset/timit/timit.py                     |  4 ++--
 .../dataset/timit/timit_kaldi_standard_split.py     |  8 ++++++++
 examples/dataset/voxforge/voxforge.py               |  5 ++++-
 6 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/examples/dataset/aidatatang_200zh/aidatatang_200zh.py b/examples/dataset/aidatatang_200zh/aidatatang_200zh.py
index e32f619e..85f478c2 100644
--- a/examples/dataset/aidatatang_200zh/aidatatang_200zh.py
+++ b/examples/dataset/aidatatang_200zh/aidatatang_200zh.py
@@ -22,6 +22,7 @@ import argparse
 import codecs
 import json
 import os
+from pathlib import Path
 
 import soundfile
 
@@ -79,6 +80,7 @@ def create_manifest(data_dir, manifest_path_prefix):
 
                 audio_path = os.path.abspath(os.path.join(subfolder, fname))
                 audio_id = os.path.basename(fname)[:-4]
+                utt2spk = Path(audio_path).parent.name
 
                 audio_data, samplerate = soundfile.read(audio_path)
                 duration = float(len(audio_data) / samplerate)
@@ -87,6 +89,7 @@ def create_manifest(data_dir, manifest_path_prefix):
                     json.dumps(
                         {
                             'utt': audio_id,
+                            'utt2spk': str(utt2spk),
                             'feat': audio_path,
                             'feat_shape': (duration, ),  # second
                             'text': text,
diff --git a/examples/dataset/mini_librispeech/mini_librispeech.py b/examples/dataset/mini_librispeech/mini_librispeech.py
index 65fee81a..730c73a8 100644
--- a/examples/dataset/mini_librispeech/mini_librispeech.py
+++ b/examples/dataset/mini_librispeech/mini_librispeech.py
@@ -74,15 +74,16 @@ def create_manifest(data_dir, manifest_path):
                 audio_filepath = os.path.join(subfolder, segments[0] + '.flac')
                 audio_data, samplerate = soundfile.read(audio_filepath)
                 duration = float(len(audio_data)) / samplerate
+
+                utt = os.path.splitext(os.path.basename(audio_filepath))[0]
+                utt2spk = '-'.join(utt.split('-')[:2])
                 json_lines.append(
                     json.dumps({
-                        'utt':
-                        os.path.splitext(os.path.basename(audio_filepath))[0],
-                        'feat':
-                        audio_filepath,
+                        'utt': utt,
+                        'utt2spk': utt2spk,
+                        'feat': audio_filepath,
                         'feat_shape': (duration, ),  #second
-                        'text':
-                        text
+                        'text': text,
                     }))
 
                 total_sec += duration
diff --git a/examples/dataset/thchs30/thchs30.py b/examples/dataset/thchs30/thchs30.py
index 77a264cb..2ec4ddab 100644
--- a/examples/dataset/thchs30/thchs30.py
+++ b/examples/dataset/thchs30/thchs30.py
@@ -113,6 +113,8 @@ def create_manifest(data_dir, manifest_path_prefix):
                 assert os.path.exists(audio_path) and os.path.exists(text_path)
 
                 audio_id = os.path.basename(audio_path)[:-4]
+                spk = audio_id.split('_')[0]
+
                 word_text, syllable_text, phone_text = read_trn(text_path)
                 audio_data, samplerate = soundfile.read(audio_path)
                 duration = float(len(audio_data) / samplerate)
@@ -122,6 +124,7 @@ def create_manifest(data_dir, manifest_path_prefix):
                     json.dumps(
                         {
                             'utt': audio_id,
+                            'utt2spk': spk,
                             'feat': audio_path,
                             'feat_shape': (duration, ),  # second
                             'text': word_text,  # charactor
diff --git a/examples/dataset/timit/timit.py b/examples/dataset/timit/timit.py
index 311d445c..c4a9f066 100644
--- a/examples/dataset/timit/timit.py
+++ b/examples/dataset/timit/timit.py
@@ -180,12 +180,12 @@ def create_manifest(data_dir, manifest_path_prefix):
                 json.dumps(
                     {
                         'utt': utt_id,
+                        'utt2spk': spk,
+                        'utt2gender': gender,
                         'feat': str(audio_path),
                         'feat_shape': (duration, ),  # second
                         'text': word_text,  # word
                         'phone': phone_text,
-                        'spk': spk,
-                        'gender': gender,
                     },
                     ensure_ascii=False))
 
diff --git a/examples/dataset/timit/timit_kaldi_standard_split.py b/examples/dataset/timit/timit_kaldi_standard_split.py
index 2b494c06..26aa76c7 100644
--- a/examples/dataset/timit/timit_kaldi_standard_split.py
+++ b/examples/dataset/timit/timit_kaldi_standard_split.py
@@ -24,6 +24,7 @@ import json
 import os
+from pathlib import Path
 
 import soundfile
 
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
@@ -67,10 +68,17 @@ def create_manifest(data_dir, manifest_path_prefix):
             audio_data, samplerate = soundfile.read(audio_path)
             duration = float(len(audio_data) / samplerate)
             text = phn_dict[audio_id]
+
+            gender_spk = str(Path(audio_path).parent.stem)
+            spk = gender_spk[1:]
+            gender = gender_spk[0]
+            utt_id = '_'.join([spk, gender, audio_id])
             json_lines.append(
                 json.dumps(
                     {
                         'utt': audio_id,
+                        'utt2spk': spk,
+                        'utt2gender': gender,
                         'feat': audio_path,
                         'feat_shape': (duration, ),  # second
                         'text': text
diff --git a/examples/dataset/voxforge/voxforge.py b/examples/dataset/voxforge/voxforge.py
index 36282bd6..373791bf 100644
--- a/examples/dataset/voxforge/voxforge.py
+++ b/examples/dataset/voxforge/voxforge.py
@@ -175,9 +175,12 @@ def generate_manifest(data_dir, manifest_path):
 
             audio_data, samplerate = soundfile.read(u)
             duration = float(len(audio_data)) / samplerate
+
+            utt = os.path.splitext(os.path.basename(u))[0]
             json_lines.append(
                 json.dumps({
-                    'utt': os.path.splitext(os.path.basename(u))[0],
+                    'utt': utt,
+                    'utt2spk': speaker,
                     'feat': u,
                     'feat_shape': (duration, ),  #second
                     'text': trans.lower()
-- 
GitLab