提交 9de5ad63 编写于 作者: Q qingen

[vector] add AMI data preparation scripts

上级 03a57502
results
\ No newline at end of file
# [AMI](https://groups.inf.ed.ac.uk/ami/corpus/)
The AMI Meeting Corpus is a multi-modal data set consisting of 100 hours of meeting recordings. For a gentle introduction to the corpus, see the [corpus overview](https://groups.inf.ed.ac.uk/ami/corpus/overview.shtml). To access the data, follow the directions given [there](https://groups.inf.ed.ac.uk/ami/download). Around two-thirds of the data has been elicited using a scenario in which the participants play different roles in a design team, taking a design project from kick-off to completion over the course of a day. The rest consists of naturally occurring meetings in a range of domains.
Detailed information can be found in the [documentation section](http://groups.inf.ed.ac.uk/ami/corpus/datasets.shtml).
"""
Data preparation.
Download: http://groups.inf.ed.ac.uk/ami/download/
Prepares metadata files (JSON) from manual annotations "segments/" using RTTM format (Oracle VAD).
Authors
* qingenz123@126.com (Qingen ZHAO) 2022
"""
import os
import logging
import argparse
import xml.etree.ElementTree as et
import glob
import json
from ami_splits import get_AMI_split
from distutils.util import strtobool
from utils.dataio import (
load_pkl,
save_pkl,
)
logger = logging.getLogger(__name__)
SAMPLERATE = 16000
def prepare_ami(
    data_folder,
    manual_annot_folder,
    save_folder,
    ref_rttm_dir,
    meta_data_dir,
    split_type="full_corpus_asr",
    skip_TNO=True,
    mic_type="Mix-Headset",
    vad_type="oracle",
    max_subseg_dur=3.0,
    overlap=1.5,
):
    """
    Prepares reference RTTM and JSON files for the AMI dataset.

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original amicorpus is stored.
    manual_annot_folder : str
        Directory where the manual annotations are stored.
    save_folder : str
        The save directory in results.
    ref_rttm_dir : str
        Directory to store reference RTTM files.
    meta_data_dir : str
        Directory to store the meta data (json) files.
    split_type : str
        Standard dataset split. See ami_splits.py for more information.
        Allowed split_type: "scenario_only", "full_corpus" or "full_corpus_asr"
    skip_TNO: bool
        Skips TNO meeting recordings if True.
    mic_type : str
        Type of microphone to be used.
    vad_type : str
        Type of VAD. Kept for future when VAD will be added.
    max_subseg_dur : float
        Duration in seconds of a subsegments to be prepared from larger segments.
    overlap : float
        Overlap duration in seconds between adjacent subsegments

    Example
    -------
    >>> from recipes.AMI.ami_prepare import prepare_ami
    >>> data_folder = '/network/datasets/ami/amicorpus/'
    >>> manual_annot_folder = '/home/mila/d/dawalatn/nauman/ami_public_manual/'
    >>> save_folder = 'results/save/'
    >>> ref_rttm_dir = 'results/ref_rttms/'
    >>> meta_data_dir = 'results/metadata/'
    >>> prepare_ami(data_folder, manual_annot_folder, save_folder, ref_rttm_dir, meta_data_dir)
    """
    splits = ["train", "dev", "eval"]

    # Meta (JSON) files this preparation produces; also used by skip() to
    # detect a completed previous run.
    meta_files = [
        os.path.join(meta_data_dir, "ami_" + split + "." + mic_type + ".subsegs.json")
        for split in splits
    ]

    # Configuration snapshot, stored as pkl so an identical re-run can be skipped.
    conf = {
        "data_folder": data_folder,
        "save_folder": save_folder,
        "ref_rttm_dir": ref_rttm_dir,
        "meta_data_dir": meta_data_dir,
        "split_type": split_type,
        "skip_TNO": skip_TNO,
        "mic_type": mic_type,
        "vad": vad_type,
        "max_subseg_dur": max_subseg_dur,
        "overlap": overlap,
        "meta_files": meta_files,
    }

    os.makedirs(save_folder, exist_ok=True)

    # Output option (pkl) file name.
    opt_file = "opt_ami_prepare." + mic_type + ".pkl"

    # Check if this phase is already done (if so, skip it)
    if skip(save_folder, conf, meta_files, opt_file):
        logger.info(
            "Skipping data preparation, as it was completed in previous run."
        )
        return

    logger.debug("\tCreating meta-data file for the AMI Dataset..")

    # Get the standard split (lists of meeting IDs per split).
    train_set, dev_set, eval_set = get_AMI_split(split_type)
    split_sets = {"train": train_set, "dev": dev_set, "eval": eval_set}

    # Prepare RTTM from XML (manual annot) and store as ground truth.
    os.makedirs(ref_rttm_dir, exist_ok=True)
    for split, meeting_ids in split_sets.items():
        rttm_file = ref_rttm_dir + "/fullref_ami_" + split + ".rttm"
        prepare_segs_for_RTTM(
            meeting_ids,
            rttm_file,
            data_folder,
            manual_annot_folder,
            split,
            skip_TNO,
        )

    # Create meta (json) files for each split.
    os.makedirs(meta_data_dir, exist_ok=True)
    for split in splits:
        rttm_file = ref_rttm_dir + "/fullref_ami_" + split + ".rttm"
        prepare_metadata(
            rttm_file,
            meta_data_dir,
            data_folder,
            "ami_" + split,
            max_subseg_dur,
            overlap,
            mic_type,
        )

    # Persist the configuration so the next identical run can be skipped.
    save_opt_file = os.path.join(save_folder, opt_file)
    save_pkl(conf, save_opt_file)
def get_RTTM_per_rec(segs, spkrs_list, rec_id):
    """Builds the RTTM lines (header plus speech segments) for one recording.

    Arguments
    ---------
    segs : list
        Rows of [start, end, speaker_ID] (strings) sorted by start time.
    spkrs_list : list
        Speaker IDs present in this recording.
    rec_id : str
        Recording ID (e.g. "ES2008c").

    Returns
    -------
    list of str
        RTTM lines: one SPKR-INFO header per speaker, then SPEAKER lines.
    """
    # Header: one SPKR-INFO line per speaker, e.g.
    # SPKR-INFO ES2008c 0 <NA> <NA> <NA> unknown ES2008c.A_PM <NA> <NA>
    rttm = [
        "SPKR-INFO "
        + rec_id
        + " 0 <NA> <NA> <NA> unknown "
        + spkr_id
        + " <NA> <NA>"
        for spkr_id in spkrs_list
    ]

    # Body: one SPEAKER line per segment, e.g.
    # SPEAKER ES2008c 0 37.880 0.590 <NA> <NA> ES2008c.A_PM <NA> <NA>
    for row in segs:
        beg = float(row[0])
        fin = float(row[1])
        if fin < beg:
            # Annotation has end before start: report and drop the row.
            msg1 = (
                "Possibly Incorrect Annotation Found!! transcriber_start (%s) > transcriber_end (%s)"
                % (row[0], row[1])
            )
            msg2 = (
                "Excluding this incorrect row from the RTTM : %s, %s, %s, %s"
                % (
                    rec_id,
                    row[0],
                    str(round(fin - beg, 4)),
                    str(row[2]),
                )
            )
            logger.info(msg1)
            logger.info(msg2)
            continue
        rttm.append(
            "SPEAKER "
            + rec_id
            + " 0 "
            + str(round(beg, 4))
            + " "
            + str(round(fin - beg, 4))
            + " <NA> <NA> "
            + str(row[2])
            + " <NA> <NA>"
        )
    return rttm
def prepare_segs_for_RTTM(
    list_ids, out_rttm_file, audio_dir, annot_dir, split_type, skip_TNO
):
    """Writes one reference RTTM file covering every recording of a split.

    Arguments
    ---------
    list_ids : list
        Meeting IDs belonging to this split.
    out_rttm_file : str
        Path of the RTTM file to write.
    audio_dir : str
        Root of the amicorpus audio tree.
    annot_dir : str
        Root of the manual-annotation tree (contains "segments/").
    split_type : str
        "train", "dev" or "eval" (only used for TNO skipping / logging).
    skip_TNO : bool
        If True, TNO ("TS*") meetings are excluded from dev and eval.
    """
    RTTM = []  # All RTTM lines of this split, clubbed together.

    for main_meet_id in list_ids:
        # TNO meetings are optionally excluded from dev and eval sets.
        if (
            skip_TNO is True
            and split_type != "train"
            and main_meet_id.startswith("TS")
        ):
            msg = (
                "Skipping TNO meeting in AMI "
                + str(split_type)
                + " set : "
                + str(main_meet_id)
            )
            logger.info(msg)
            continue

        # One meeting ID maps to several sessions (e.g. ES2008a..d).
        for sess in sorted(glob.glob(audio_dir + "/" + main_meet_id + "*")):
            rec_id = os.path.basename(sess)

            # Per-speaker segment annotations: <rec_id>.<A|B|C|...>.segments.xml
            pattern = annot_dir + "/segments/" + rec_id + ".*.segments.xml"
            segs = []
            spkrs_list = []  # non-scenario recordings contain 3-5 speakers
            for spkr_xml_file in sorted(glob.glob(pattern)):
                # Speaker ID is the letter between rec_id and "segments".
                spkr_ID = rec_id + "." + os.path.basename(spkr_xml_file).split(".")[1]
                spkrs_list.append(spkr_ID)

                # Collect [start, end, speaker_ID] from the XML annotation.
                root = et.parse(spkr_xml_file).getroot()
                for elem in root.iter("segment"):
                    segs.append(
                        [
                            elem.attrib["transcriber_start"],
                            elem.attrib["transcriber_end"],
                            spkr_ID,
                        ]
                    )

            # Order segments by start time before emitting RTTM rows.
            segs.sort(key=lambda seg: float(seg[0]))
            RTTM.extend(get_RTTM_per_rec(segs, spkrs_list, rec_id))

    # Write one RTTM as groundtruth, e.g. "fullref_ami_eval.rttm".
    with open(out_rttm_file, "w") as f:
        for item in RTTM:
            f.write("%s\n" % item)
def is_overlapped(end1, start2):
    """Tells whether a segment starting at `start2` overlaps one ending at `end1`.

    Arguments
    ---------
    end1 : float
        End time of the first segment.
    start2 : float
        Start time of the second segment.

    Returns
    -------
    bool
        True when the second segment begins at or before the first one ends.
    """
    return not start2 > end1
def merge_rttm_intervals(rttm_segs):
    """Merges overlapping RTTM segment rows of a single recording.

    Rows are RTTM-format lists; field 3 is start and field 4 is duration
    (both strings). Input rows are sorted and mutated in place; merged rows
    get speaker field 7 set to "overlap", so individual speaker IDs are lost.
    """
    # Order by start time (field 3).
    rttm_segs.sort(key=lambda row: float(row[3]))

    # Running interval [cur_start, cur_end) of the last merged row.
    merged = [rttm_segs[0]]
    cur_start = float(rttm_segs[0][3])
    cur_end = cur_start + float(rttm_segs[0][4])

    for seg in rttm_segs[1:]:
        s = float(seg[3])
        e = s + float(seg[4])
        if not s > cur_end:
            # Overlap: extend the last merged row; start stays the same.
            cur_end = max(cur_end, e)
            last = merged[-1]
            last[3] = str(round(cur_start, 4))
            last[4] = str(round(cur_end - cur_start, 4))
            last[7] = "overlap"
        else:
            # Disjoint: start a fresh interval (row keeps its single speaker).
            cur_start = s
            cur_end = e
            merged.append(seg)
    return merged
def get_subsegments(merged_segs, max_subseg_dur=3.0, overlap=1.5):
    """Splits long RTTM segments into overlapping sub-segments.

    Segments not longer than `max_subseg_dur` pass through unchanged.
    Longer ones are chopped into windows of `max_subseg_dur` seconds that
    slide by `max_subseg_dur - overlap` seconds.
    """
    shift = max_subseg_dur - overlap
    subsegments = []

    # Rows are in RTTM format: field 3 = start, field 4 = duration (strings).
    for row in merged_segs:
        seg_dur = float(row[4])
        if not seg_dur > max_subseg_dur:
            # Short enough already: keep the original row.
            subsegments.append(row)
            continue

        rec_id = row[1]
        seg_start = float(row[3])
        seg_end = seg_start + seg_dur
        for k in range(int(seg_dur / shift)):
            sub_start = seg_start + k * shift
            # 0.01 s is shaved off the window end (small step).
            sub_end = min(sub_start + max_subseg_dur - 0.01, seg_end)
            subsegments.append(
                [
                    "SPEAKER",
                    rec_id,
                    "0",
                    str(round(float(sub_start), 4)),
                    str(round(float(sub_end - sub_start), 4)),
                    "<NA>",
                    "<NA>",
                    row[7],
                    "<NA>",
                    "<NA>",
                ]
            )
            # Stop once the segment boundary is reached.
            if sub_end >= seg_end:
                break
    return subsegments
def prepare_metadata(
    rttm_file, save_dir, data_dir, filename, max_subseg_dur, overlap, mic_type
):
    """Creates subsegment metadata (JSON) plus intermediate RTTMs from a reference RTTM.

    Pipeline per recording: select its SPEAKER rows -> merge overlapping
    intervals -> split into sub-segments -> emit JSON entries keyed by
    "<rec_id>_<start>_<end>" pointing at the wav file(s) with sample offsets.

    Arguments
    ---------
    rttm_file : str
        Path to the reference RTTM produced by prepare_segs_for_RTTM.
    save_dir : str
        Directory where the RTTM/JSON outputs are written.
    data_dir : str
        Root of the amicorpus audio tree.
    filename : str
        Output file prefix, e.g. "ami_train".
    max_subseg_dur : float
        Maximum sub-segment duration in seconds.
    overlap : float
        Overlap in seconds between adjacent sub-segments.
    mic_type : str
        Microphone type; "Array1" selects the 8-channel multi-mic layout.
    """
    # Read RTTM, get unique meeting_IDs (from RTTM headers)
    # For each MeetingID. select that meetID -> merge -> subsegment -> json -> append
    # Read RTTM
    RTTM = []
    with open(rttm_file, "r") as f:
        for line in f:
            entry = line[:-1]  # drop the trailing newline
            RTTM.append(entry)
    # Recording IDs come from the SPKR-INFO header lines (field 1).
    spkr_info = filter(lambda x: x.startswith("SPKR-INFO"), RTTM)
    rec_ids = list(set([row.split(" ")[1] for row in spkr_info]))
    rec_ids.sort()  # sorting just to make JSON look in proper sequence
    # For each recording merge segments and then perform subsegmentation
    MERGED_SEGMENTS = []
    SUBSEGMENTS = []
    for rec_id in rec_ids:
        # Only the SPEAKER rows of this recording.
        segs_iter = filter(
            lambda x: x.startswith("SPEAKER " + str(rec_id)), RTTM
        )
        gt_rttm_segs = [row.split(" ") for row in segs_iter]
        # Merge, subsegment and then convert to json format.
        merged_segs = merge_rttm_intervals(
            gt_rttm_segs
        )  # We lose speaker_ID after merging
        MERGED_SEGMENTS = MERGED_SEGMENTS + merged_segs
        # Divide segments into smaller sub-segments
        subsegs = get_subsegments(merged_segs, max_subseg_dur, overlap)
        SUBSEGMENTS = SUBSEGMENTS + subsegs
    # Write segment AND sub-segments (in RTTM format)
    segs_file = save_dir + "/" + filename + ".segments.rttm"
    subsegment_file = save_dir + "/" + filename + ".subsegments.rttm"
    with open(segs_file, "w") as f:
        for row in MERGED_SEGMENTS:
            line_str = " ".join(row)
            f.write("%s\n" % line_str)
    with open(subsegment_file, "w") as f:
        for row in SUBSEGMENTS:
            line_str = " ".join(row)
            f.write("%s\n" % line_str)
    # Create JSON from subsegments
    json_dict = {}
    for row in SUBSEGMENTS:
        rec_id = row[1]
        strt = str(round(float(row[3]), 4))
        end = str(round((float(row[3]) + float(row[4])), 4))
        subsegment_ID = rec_id + "_" + strt + "_" + end
        dur = row[4]
        # Sample offsets assume SAMPLERATE (16 kHz) audio.
        start_sample = int(float(strt) * SAMPLERATE)
        end_sample = int(float(end) * SAMPLERATE)
        # If multi-mic audio is selected
        if mic_type == "Array1":
            wav_file_base_path = (
                data_dir
                + "/"
                + rec_id
                + "/audio/"
                + rec_id
                + "."
                + mic_type
                + "-"
            )
            f = []  # adding all 8 mics
            for i in range(8):
                # Channels are named Array1-01.wav .. Array1-08.wav.
                f.append(wav_file_base_path + str(i + 1).zfill(2) + ".wav")
            audio_files_path_list = f
            # Note: key "files" with 's' is used for multi-mic
            json_dict[subsegment_ID] = {
                "wav": {
                    "files": audio_files_path_list,
                    "duration": float(dur),
                    "start": int(start_sample),
                    "stop": int(end_sample),
                },
            }
        else:
            # Single mic audio
            wav_file_path = (
                data_dir
                + "/"
                + rec_id
                + "/audio/"
                + rec_id
                + "."
                + mic_type
                + ".wav"
            )
            # Note: key "file" without 's' is used for single-mic
            json_dict[subsegment_ID] = {
                "wav": {
                    "file": wav_file_path,
                    "duration": float(dur),
                    "start": int(start_sample),
                    "stop": int(end_sample),
                },
            }
    out_json_file = save_dir + "/" + filename + "." + mic_type + ".subsegs.json"
    with open(out_json_file, mode="w") as json_f:
        json.dump(json_dict, json_f, indent=2)
    msg = "%s JSON prepared" % (out_json_file)
    logger.debug(msg)
def skip(save_folder, conf, meta_files, opt_file):
    """
    Detects whether the AMI data preparation has already been done.

    Preparation can be skipped only when every meta (json) file exists AND
    the options saved by the previous run match the current configuration.

    Returns
    -------
    bool
        if True, the preparation phase can be skipped.
        if False, it must be done.
    """
    # Every meta (json) file must already be on disk.
    if not all(os.path.isfile(path) for path in meta_files):
        return False

    # A saved options pkl from a previous run must exist...
    save_opt_file = os.path.join(save_folder, opt_file)
    if not os.path.isfile(save_opt_file):
        return False

    # ...and its contents must match the current configuration exactly.
    return load_pkl(save_opt_file) == conf
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        prog='python ami_prepare.py --data_folder /home/data/ami/amicorpus \
--manual_annot_folder /home/data/ami/ami_public_manual_1.6.2 \
--save_folder ./results/ --ref_rttm_dir ./results/ref_rttms \
--meta_data_dir ./results/metadata',
        description='AMI Data preparation')
    parser.add_argument(
        '--data_folder', required=True, help='Path to the folder where the original amicorpus is stored')
    parser.add_argument(
        '--manual_annot_folder', required=True, help='Directory where the manual annotations are stored')
    parser.add_argument(
        '--save_folder', required=True, help='The save directory in results')
    parser.add_argument(
        '--ref_rttm_dir', required=True, help='Directory to store reference RTTM files')
    parser.add_argument(
        '--meta_data_dir', required=True, help='Directory to store the meta data (json) files')
    parser.add_argument(
        '--split_type',
        default="full_corpus_asr",
        help='Standard dataset split. See ami_splits.py for more information')
    parser.add_argument(
        '--skip_TNO', default=True, type=strtobool, help='Skips TNO meeting recordings if True')
    parser.add_argument(
        '--mic_type', default="Mix-Headset", help='Type of microphone to be used')
    parser.add_argument(
        '--vad_type', default="oracle", help='Type of VAD. Kept for future when VAD will be added')
    parser.add_argument(
        '--max_subseg_dur',
        default=3.0,
        type=float,
        help='Duration in seconds of a subsegments to be prepared from larger segments')
    parser.add_argument(
        '--overlap', default=1.5, type=float, help='Overlap duration in seconds between adjacent subsegments')

    args = parser.parse_args()
    print(args)

    # Forward ALL parsed options to prepare_ami. Previously only the first
    # five were passed, so --split_type, --skip_TNO, --mic_type, --vad_type,
    # --max_subseg_dur and --overlap were silently ignored.
    prepare_ami(
        args.data_folder,
        args.manual_annot_folder,
        args.save_folder,
        args.ref_rttm_dir,
        args.meta_data_dir,
        split_type=args.split_type,
        skip_TNO=bool(args.skip_TNO),  # strtobool returns 0/1
        mic_type=args.mic_type,
        vad_type=args.vad_type,
        max_subseg_dur=args.max_subseg_dur,
        overlap=args.overlap,
    )
\ No newline at end of file
"""
AMI corpus contained 100 hours of meeting recording.
This script returns the standard train, dev and eval split for AMI corpus.
For more information on dataset please refer to http://groups.inf.ed.ac.uk/ami/corpus/datasets.shtml
Authors
* qingenz123@126.com (Qingen ZHAO) 2022
Credits
"""
ALLOWED_OPTIONS = ["scenario_only", "full_corpus", "full_corpus_asr"]


def get_AMI_split(split_option):
    """
    Prepares train, dev, and test sets for given split_option

    Arguments
    ---------
    split_option: str
        The standard split option.
        Allowed options: "scenario_only", "full_corpus", "full_corpus_asr"

    Returns
    -------
    tuple of list
        Meeting IDs for train, dev, and test sets for given split_option.

    Raises
    ------
    ValueError
        If split_option is not one of ALLOWED_OPTIONS. (Previously the
        function printed a message and returned None, which made the
        caller's tuple-unpack crash with an unrelated TypeError.)
    """
    if split_option not in ALLOWED_OPTIONS:
        raise ValueError(
            'Invalid split "%s" requested! Valid split_options are: %s'
            % (split_option, ALLOWED_OPTIONS)
        )

    if split_option == "scenario_only":
        train_set = [
            "ES2002",
            "ES2005",
            "ES2006",
            "ES2007",
            "ES2008",
            "ES2009",
            "ES2010",
            "ES2012",
            "ES2013",
            "ES2015",
            "ES2016",
            "IS1000",
            "IS1001",
            "IS1002",
            "IS1003",
            "IS1004",
            "IS1005",
            "IS1006",
            "IS1007",
            "TS3005",
            "TS3008",
            "TS3009",
            "TS3010",
            "TS3011",
            "TS3012",
        ]
        dev_set = [
            "ES2003",
            "ES2011",
            "IS1008",
            "TS3004",
            "TS3006",
        ]
        test_set = [
            "ES2004",
            "ES2014",
            "IS1009",
            "TS3003",
            "TS3007",
        ]

    elif split_option == "full_corpus":
        # List of train: SA (TRAINING PART OF SEEN DATA)
        train_set = [
            "ES2002",
            "ES2005",
            "ES2006",
            "ES2007",
            "ES2008",
            "ES2009",
            "ES2010",
            "ES2012",
            "ES2013",
            "ES2015",
            "ES2016",
            "IS1000",
            "IS1001",
            "IS1002",
            "IS1003",
            "IS1004",
            "IS1005",
            "IS1006",
            "IS1007",
            "TS3005",
            "TS3008",
            "TS3009",
            "TS3010",
            "TS3011",
            "TS3012",
            "EN2001",
            "EN2003",
            "EN2004",
            "EN2005",
            "EN2006",
            "EN2009",
            "IN1001",
            "IN1002",
            "IN1005",
            "IN1007",
            "IN1008",
            "IN1009",
            "IN1012",
            "IN1013",
            "IN1014",
            "IN1016",
        ]
        # List of dev: SB (DEV PART OF SEEN DATA)
        dev_set = [
            "ES2003",
            "ES2011",
            "IS1008",
            "TS3004",
            "TS3006",
            "IB4001",
            "IB4002",
            "IB4003",
            "IB4004",
            "IB4010",
            "IB4011",
        ]
        # List of test: SC (UNSEEN DATA FOR EVALUATION)
        # Note that IB4005 does not appear because it has speakers in common with two sets of data.
        test_set = [
            "ES2004",
            "ES2014",
            "IS1009",
            "TS3003",
            "TS3007",
            "EN2002",
        ]

    else:  # split_option == "full_corpus_asr"
        train_set = [
            "ES2002",
            "ES2003",
            "ES2005",
            "ES2006",
            "ES2007",
            "ES2008",
            "ES2009",
            "ES2010",
            "ES2012",
            "ES2013",
            "ES2014",
            "ES2015",
            "ES2016",
            "IS1000",
            "IS1001",
            "IS1002",
            "IS1003",
            "IS1004",
            "IS1005",
            "IS1006",
            "IS1007",
            "TS3005",
            "TS3006",
            "TS3007",
            "TS3008",
            "TS3009",
            "TS3010",
            "TS3011",
            "TS3012",
            "EN2001",
            "EN2003",
            "EN2004",
            "EN2005",
            "EN2006",
            "EN2009",
            "IN1001",
            "IN1002",
            "IN1005",
            "IN1007",
            "IN1008",
            "IN1009",
            "IN1012",
            "IN1013",
            "IN1014",
            "IN1016",
        ]
        # NOTE(review): the original file built these dev/test lists and then
        # overwrote train/dev/test with tiny IB4001-IB4004 lists (apparently
        # leftover debug code), which silently discarded the real splits.
        # The debug overrides are removed here so the documented split is
        # actually returned.
        dev_set = [
            "ES2011",
            "IS1008",
            "TS3004",
            "IB4001",
            "IB4002",
            "IB4003",
            "IB4004",
            "IB4010",
            "IB4011",
        ]
        test_set = [
            "ES2004",
            "IS1009",
            "TS3003",
            "EN2002",
        ]

    return train_set, dev_set, test_set
"""
Data reading and writing.
Authors
* qingenz123@126.com (Qingen ZHAO) 2022
"""
import os
import pickle
import time
def save_pkl(obj, file):
    """Save an object in pkl format.

    Arguments
    ---------
    obj : object
        Object to save in pkl format
    file : str
        Path to the output file

    Example
    -------
    >>> tmpfile = os.path.join(getfixture('tmpdir'), "example.pkl")
    >>> save_pkl([1, 2, 3, 4, 5], tmpfile)
    >>> load_pkl(tmpfile)
    [1, 2, 3, 4, 5]
    """
    # Fix: the old docstring documented a "sampling_rate" parameter that the
    # function never had; it is removed here.
    with open(file, "wb") as f:
        pickle.dump(obj, f)
def load_pickle(pickle_path):
    """Utility function for loading .pkl pickle files.

    Arguments
    ---------
    pickle_path : str
        Path to pickle file.

    Returns
    -------
    out : object
        Python object loaded from pickle.
    """
    with open(pickle_path, "rb") as fin:
        return pickle.load(fin)
def load_pkl(file):
    """Loads a pkl file.

    For an example, see `save_pkl`.

    Arguments
    ---------
    file : str
        Path to the input pkl file.

    Returns
    -------
    The loaded object.
    """
    # Cooperative ".lock"-file protocol: if another process holds the lock,
    # poll once per second for up to 100 s, then proceed anyway.
    # Fix: this function used time.sleep() but the module never imported
    # `time`, so any existing lock file raised NameError (import added to the
    # module's import block).
    count = 100
    while count > 0:
        if os.path.isfile(file + ".lock"):
            time.sleep(1)
            count -= 1
        else:
            break

    try:
        # Take the lock, then read; the lock is always released in `finally`.
        open(file + ".lock", "w").close()
        with open(file, "rb") as f:
            return pickle.load(f)
    finally:
        if os.path.isfile(file + ".lock"):
            os.remove(file + ".lock")
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册