Unverified commit 98fa5803, authored by KP, committed by GitHub

Add aishell and librispeech dataset (#5312)

* Add aishell and librispeech dataset

* Add aishell and librispeech dataset

* Add aishell and librispeech dataset

* Add UrbanAudioVisualScenes Dataset

* Update features api
Parent bbdb65ed
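As a hedged illustration of the "Update features api" change (mel_spect renamed to melspectrogram), a minimal sketch that mirrors the call pattern used in the diff below; 'sample.wav' is a placeholder path, not part of this commit:

# Illustrative sketch only; not part of this diff.
import numpy as np
from paddleaudio.backends import load as load_audio
from paddleaudio.features import melspectrogram

waveform, sr = load_audio('sample.wav', sr=None)   # placeholder audio file
feats = melspectrogram(waveform, sr).transpose()   # same call pattern as in the diff
print(np.array(feats).shape)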
......@@ -20,7 +20,7 @@ from typing import List
import numpy as np
import paddle
from paddleaudio.backends import load as load_audio
from paddleaudio.features import mel_spect
from paddleaudio.features import melspectrogram
from paddleaudio.models.panns import cnn14
from paddleaudio.utils import logger
......@@ -59,8 +59,7 @@ def batchify(data: List[List[float]], sample_rate: int, batch_size: int,
"""
examples = []
for waveform in data:
feats = mel_spect(waveform, sample_rate=sample_rate,
**kwargs).transpose()
feats = melspectrogram(waveform, sample_rate, **kwargs).transpose()
examples.append(feats)
# Separates data into some batches.
......
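The batching step itself is collapsed in the hunk above; a hedged sketch of what "Separates data into some batches" could look like, assuming simple fixed-size chunking of the examples list (padding and stacking details omitted):

# Sketch only; the real (collapsed) implementation may pad and stack features.
batches = [examples[i:i + batch_size] for i in range(0, len(examples), batch_size)]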
......@@ -22,7 +22,7 @@ import paddle.nn.functional as F
from model import SoundClassifier
from paddleaudio.backends import load as load_audio
from paddleaudio.datasets import ESC50
from paddleaudio.features import mel_spect
from paddleaudio.features import melspectrogram
from paddleaudio.models.panns import cnn14
# yapf: disable
......@@ -37,14 +37,16 @@ args = parser.parse_args()
def extract_features(file: str, **kwargs):
waveform, sr = load_audio(args.wav, sr=None)
feats = mel_spect(waveform, sample_rate=sr, **kwargs).transpose()
feats = melspectrogram(waveform, sr, **kwargs).transpose()
return feats
if __name__ == '__main__':
paddle.set_device(args.device)
model = SoundClassifier(backbone=cnn14(pretrained=False, extract_embedding=True), num_class=len(ESC50.label_list))
model = SoundClassifier(backbone=cnn14(pretrained=False,
extract_embedding=True),
num_class=len(ESC50.label_list))
model.set_state_dict(paddle.load(args.checkpoint))
model.eval()
......
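A hedged sketch of how the loaded classifier above might score a single clip; the forward signature of SoundClassifier is not shown in this diff, so the batched tensor layout below is an assumption:

# Illustrative only; tensor layout and forward signature are assumptions.
import numpy as np
feats = extract_features(args.wav)
feats = paddle.to_tensor(np.expand_dims(feats, 0))            # add a batch dimension
logits = model(feats)
probs = paddle.nn.functional.softmax(logits, axis=1).numpy()
print(ESC50.label_list[int(probs[0].argmax())])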
......@@ -43,16 +43,15 @@ if __name__ == "__main__":
backbone = cnn14(pretrained=True, extract_embedding=True)
model = SoundClassifier(backbone, num_class=len(ESC50.label_list))
optimizer = paddle.optimizer.Adam(learning_rate=args.learning_rate, parameters=model.parameters())
optimizer = paddle.optimizer.Adam(learning_rate=args.learning_rate,
parameters=model.parameters())
criterion = paddle.nn.loss.CrossEntropyLoss()
train_ds = ESC50(mode='train', feat_type='mel_spect')
dev_ds = ESC50(mode='dev', feat_type='mel_spect')
train_ds = ESC50(mode='train', feat_type='melspectrogram')
dev_ds = ESC50(mode='dev', feat_type='melspectrogram')
train_sampler = paddle.io.DistributedBatchSampler(train_ds,
batch_size=args.batch_size,
shuffle=True,
drop_last=False)
train_sampler = paddle.io.DistributedBatchSampler(
train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False)
train_loader = paddle.io.DataLoader(
train_ds,
batch_sampler=train_sampler,
......@@ -78,7 +77,8 @@ if __name__ == "__main__":
loss = criterion(logits, labels)
loss.backward()
optimizer.step()
if isinstance(optimizer._learning_rate, paddle.optimizer.lr.LRScheduler):
if isinstance(optimizer._learning_rate,
paddle.optimizer.lr.LRScheduler):
optimizer._learning_rate.step()
optimizer.clear_grad()
......@@ -97,10 +97,12 @@ if __name__ == "__main__":
avg_loss /= args.log_freq
avg_acc = num_corrects / num_samples
print_msg = 'Epoch={}/{}, Step={}/{}'.format(epoch, args.epochs, batch_idx + 1, steps_per_epoch)
print_msg = 'Epoch={}/{}, Step={}/{}'.format(
epoch, args.epochs, batch_idx + 1, steps_per_epoch)
print_msg += ' loss={:.4f}'.format(avg_loss)
print_msg += ' acc={:.4f}'.format(avg_acc)
print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format(lr, timer.timing, timer.eta)
print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format(
lr, timer.timing, timer.eta)
logger.train(print_msg)
avg_loss = 0
......@@ -108,7 +110,10 @@ if __name__ == "__main__":
num_samples = 0
if epoch % args.save_freq == 0 and batch_idx + 1 == steps_per_epoch and local_rank == 0:
dev_sampler = paddle.io.BatchSampler(dev_ds, batch_size=args.batch_size, shuffle=False, drop_last=False)
dev_sampler = paddle.io.BatchSampler(dev_ds,
batch_size=args.batch_size,
shuffle=False,
drop_last=False)
dev_loader = paddle.io.DataLoader(
dev_ds,
batch_sampler=dev_sampler,
......@@ -134,7 +139,10 @@ if __name__ == "__main__":
logger.eval(print_msg)
# Save model
save_dir = os.path.join(args.checkpoint_dir, 'epoch_{}'.format(epoch))
save_dir = os.path.join(args.checkpoint_dir,
'epoch_{}'.format(epoch))
logger.info('Saving model checkpoint to {}'.format(save_dir))
paddle.save(model.state_dict(), os.path.join(save_dir, 'model.pdparams'))
paddle.save(optimizer.state_dict(), os.path.join(save_dir, 'model.pdopt'))
paddle.save(model.state_dict(),
os.path.join(save_dir, 'model.pdparams'))
paddle.save(optimizer.state_dict(),
os.path.join(save_dir, 'model.pdopt'))
......@@ -12,18 +12,23 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .dcase import UrbanAcousticScenes
from .aishell import AISHELL1
from .dcase import UrbanAcousticScenes, UrbanAudioVisualScenes
from .esc50 import ESC50
from .gtzan import GTZAN
from .librispeech import LIBRISPEECH
from .ravdess import RAVDESS
from .tess import TESS
from .urban_sound import UrbanSound8K
__all__ = [
'AISHELL1',
'LIBRISPEECH',
'ESC50',
'UrbanSound8K',
'GTZAN',
'UrbanAcousticScenes',
'UrbanAudioVisualScenes',
'RAVDESS',
'TESS',
]
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import codecs
import collections
import json
import os
from typing import Dict, List, Tuple
from paddle.io import Dataset
from tqdm import tqdm
from ..backends import load as load_audio
from ..utils.download import decompress, download_and_decompress
from ..utils.env import DATA_HOME
from ..utils.log import logger
from .dataset import feat_funcs
__all__ = ['AISHELL1']
class AISHELL1(Dataset):
"""
This open-source Mandarin speech corpus, AISHELL-ASR0009-OS1, is 178 hours long.
It is part of AISHELL-ASR0009, whose utterances cover 11 domains, including
smart home, autonomous driving, and industrial production. All recordings were
made in a quiet indoor environment using 3 devices simultaneously: a high-fidelity
microphone (44.1 kHz, 16-bit), an Android phone (16 kHz, 16-bit), and an iOS phone
(16 kHz, 16-bit). The high-fidelity audio was re-sampled to 16 kHz to build
AISHELL-ASR0009-OS1. 400 speakers from different accent areas in China were
invited to participate in the recording. Through professional speech annotation
and strict quality inspection, the manual transcription accuracy is above 95%.
The corpus is divided into training, development and testing sets.
Reference:
AISHELL-1: An Open-Source Mandarin Speech Corpus and A Speech Recognition Baseline
https://arxiv.org/abs/1709.05522
"""
archieves = [
{
'url': 'http://www.openslr.org/resources/33/data_aishell.tgz',
'md5': '2f494334227864a8a8fec932999db9d8',
},
]
text_meta = os.path.join('data_aishell', 'transcript', 'aishell_transcript_v0.8.txt')
utt_info = collections.namedtuple('META_INFO', ('file_path', 'utt_id', 'text'))
audio_path = os.path.join('data_aishell', 'wav')
manifest_path = os.path.join('data_aishell', 'manifest')
subset = ['train', 'dev', 'test']
def __init__(self, subset: str = 'train', feat_type: str = 'raw', **kwargs):
assert subset in self.subset, 'Dataset subset must be one of {}, but got {}'.format(self.subset, subset)
self.subset = subset
self.feat_type = feat_type
self.feat_config = kwargs
self._data = self._get_data()
super(AISHELL1, self).__init__()
def _get_text_info(self) -> Dict[str, str]:
ret = {}
with open(os.path.join(DATA_HOME, self.text_meta), 'r') as rf:
for line in rf.readlines()[1:]:
utt_id, text = map(str.strip, line.split(' ', 1)) # utt_id, text
ret.update({utt_id: ''.join(text.split())})
return ret
def _get_data(self):
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.text_meta)):
download_and_decompress(self.archieves, DATA_HOME)
# Extract *wav from *.tar.gz.
for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path)):
for file in files:
if file.endswith('.tar.gz'):
decompress(os.path.join(root, file))
os.remove(os.path.join(root, file))
text_info = self._get_text_info()
data = []
for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path, self.subset)):
for file in files:
if file.endswith('.wav'):
utt_id = os.path.splitext(file)[0]
if utt_id not in text_info:  # Some utterances have no transcription label
continue
text = text_info[utt_id]
file_path = os.path.join(root, file)
data.append(self.utt_info(file_path, utt_id, text))
return data
def _convert_to_record(self, idx: int):
sample = self._data[idx]
record = {}
# To show all fields in a namedtuple: `type(sample)._fields`
for field in type(sample)._fields:
record[field] = getattr(sample, field)
waveform, sr = load_audio(sample[0]) # The first element of sample is file path
feat_func = feat_funcs[self.feat_type]
feat = feat_func(waveform, sample_rate=sr, **self.feat_config) if feat_func else waveform
record.update({'feat': feat, 'duration': len(waveform) / sr})
return record
def create_manifest(self, prefix='manifest'):
if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)):
os.makedirs(os.path.join(DATA_HOME, self.manifest_path))
manifest_file = os.path.join(DATA_HOME, self.manifest_path, f'{prefix}.{self.subset}')
with codecs.open(manifest_file, 'w', 'utf-8') as f:
for idx in tqdm(range(len(self))):
record = self._convert_to_record(idx)
record_line = json.dumps(
{
'utt': record['utt_id'],
'feat': record['file_path'],
'feat_shape': (record['duration'], ),
'text': record['text']
},
ensure_ascii=False)
f.write(record_line + '\n')
logger.info(f'Manifest file {manifest_file} created.')
def __getitem__(self, idx):
record = self._convert_to_record(idx)
return tuple(record.values())
def __len__(self):
return len(self._data)
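A hedged usage sketch of the AISHELL1 dataset added above; the first access triggers download and decompression, and the tuple order follows the record fields built in _convert_to_record:

# Illustrative only; not part of this diff.
from paddleaudio.datasets import AISHELL1

train_ds = AISHELL1(subset='train', feat_type='raw')   # 'melspectrogram' and 'mfcc' are also supported
train_ds.create_manifest(prefix='manifest')            # writes manifest.train under DATA_HOME
file_path, utt_id, text, feat, duration = train_ds[0]  # order follows the record fields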
......@@ -12,55 +12,50 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
from typing import List, Tuple
from typing import Dict, List, Tuple
import numpy as np
import paddle
from pathos.multiprocessing import ProcessPool
from pathos.threading import ThreadPool
from tqdm import tqdm
from ..backends import load as load_audio
from ..features import linear_spect, log_spect, mel_spect
from ..utils.log import logger
from ..features import melspectrogram, mfcc
feat_funcs = {
'raw': None,
'melspectrogram': melspectrogram,
'mfcc': mfcc,
}
class AudioClassificationDataset(paddle.io.Dataset):
"""
Base class of audio classification dataset.
"""
_feat_func = {
'raw': None,
'mel_spect': mel_spect,
'linear_spect': linear_spect,
'log_spect': log_spect,
}
def __init__(self,
files: List[str],
labels: List[int],
sample_rate: int,
duration: float,
feat_type: str = 'raw',
**kwargs):
"""
Args:
files (:obj:`List[str]`): A list of absolute path of audio files.
labels (:obj:`List[int]`): Labels of audio files.
sample_rate (:obj:`int`): Sample rate of audio files.
duration (:obj:`float`): Duration of audio files.
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
super(AudioClassificationDataset, self).__init__()
if feat_type not in self._feat_func.keys():
if feat_type not in feat_funcs.keys():
raise RuntimeError(\
f"Unknown feat_type: {feat_type}, it must be one in {list(self._feat_func.keys())}")
f"Unknown feat_type: {feat_type}, it must be one in {list(feat_funcs.keys())}")
self.files = files
self.labels = labels
self.sample_rate = sample_rate
self.duration = duration
self.feat_type = feat_type
self.feat_config = kwargs # Pass keyword arguments to customize feature config
......@@ -71,24 +66,19 @@ class AudioClassificationDataset(paddle.io.Dataset):
def _convert_to_record(self, idx):
file, label = self.files[idx], self.labels[idx]
waveform, _ = load_audio(file, sr=self.sample_rate)
normal_length = self.sample_rate * self.duration
if len(waveform) > normal_length:
waveform = waveform[:normal_length]
else:
waveform = np.pad(waveform, (0, normal_length - len(waveform)))
feat_func = self._feat_func[self.feat_type]
waveform, sample_rate = load_audio(file)
feat_func = feat_funcs[self.feat_type]
record = {}
record['feat'] = feat_func(waveform, sample_rate=self.sample_rate, **
record['feat'] = feat_func(waveform, sample_rate, **
self.feat_config) if feat_func else waveform
record['label'] = label
return record
def __getitem__(self, idx):
record = self._convert_to_record(idx)
return np.array(record['feat']).transpose(), np.array(record['label'], dtype=np.int64)
return np.array(record['feat']).transpose(), np.array(record['label'],
dtype=np.int64)
def __len__(self):
return len(self.files)
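Since __getitem__ returns (feature, label) pairs, a hedged sketch of feeding a subclass such as ESC50 into a Paddle DataLoader, mirroring the train.py changes earlier in this commit:

# Illustrative only; batch size and shuffling are arbitrary here.
import paddle
from paddleaudio.datasets import ESC50

train_ds = ESC50(mode='train', feat_type='melspectrogram')
loader = paddle.io.DataLoader(train_ds, batch_size=16, shuffle=True)
feats, labels = next(iter(loader))   # feats: batched features; labels: int64 class ids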
......@@ -20,7 +20,7 @@ from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['UrbanAcousticScenes']
__all__ = ['UrbanAcousticScenes', 'UrbanAudioVisualScenes']
class UrbanAcousticScenes(AudioClassificationDataset):
......@@ -119,8 +119,6 @@ class UrbanAcousticScenes(AudioClassificationDataset):
}
subset_meta_info = collections.namedtuple('SUBSET_META_INFO', ('filename', 'scene_label'))
audio_path = os.path.join(base_name, 'audio')
sample_rate = 44100 # 44.1 khz
duration = 10 # 10s
def __init__(self, mode: str = 'train', feat_type: str = 'raw', **kwargs):
"""
......@@ -131,12 +129,7 @@ class UrbanAcousticScenes(AudioClassificationDataset):
It identifies the feature type that the user wants to extract from an audio file.
"""
files, labels = self._get_data(mode)
super(UrbanAcousticScenes, self).__init__(files=files,
labels=labels,
sample_rate=self.sample_rate,
duration=self.duration,
feat_type=feat_type,
**kwargs)
super(UrbanAcousticScenes, self).__init__(files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, subset: str = None, skip_header: bool = True) -> List[collections.namedtuple]:
if subset is None:
......@@ -164,7 +157,121 @@ class UrbanAcousticScenes(AudioClassificationDataset):
files = []
labels = []
for sample in meta_info:
filename, label = sample
filename, label = sample[:2]
filename = os.path.basename(filename)
target = self.label_list.index(label)
files.append(os.path.join(DATA_HOME, self.audio_path, filename))
labels.append(int(target))
return files, labels
class UrbanAudioVisualScenes(AudioClassificationDataset):
"""
The TAU Urban Audio Visual Scenes 2021 development dataset contains synchronized audio
and video recordings from 12 European cities in 10 different scenes.
This dataset consists of 10-second audio and video segments from 10
acoustic scenes. The total amount of audio in the development set is 34 hours.
Reference:
A Curated Dataset of Urban Scenes for Audio-Visual Scene Analysis
https://arxiv.org/abs/2011.00030
"""
source_url = 'https://zenodo.org/record/4477542/files/'
base_name = 'TAU-urban-audio-visual-scenes-2021-development'
archieves = [
{
'url': source_url + base_name + '.meta.zip',
'md5': '76e3d7ed5291b118372e06379cb2b490',
},
{
'url': source_url + base_name + '.audio.1.zip',
'md5': '186f6273f8f69ed9dbdc18ad65ac234f',
},
{
'url': source_url + base_name + '.audio.2.zip',
'md5': '7fd6bb63127f5785874a55aba4e77aa5',
},
{
'url': source_url + base_name + '.audio.3.zip',
'md5': '61396bede29d7c8c89729a01a6f6b2e2',
},
{
'url': source_url + base_name + '.audio.4.zip',
'md5': '6ddac89717fcf9c92c451868eed77fe1',
},
{
'url': source_url + base_name + '.audio.5.zip',
'md5': 'af4820756cdf1a7d4bd6037dc034d384',
},
{
'url': source_url + base_name + '.audio.6.zip',
'md5': 'ebd11ec24411f2a17a64723bd4aa7fff',
},
{
'url': source_url + base_name + '.audio.7.zip',
'md5': '2be39a76aeed704d5929d020a2909efd',
},
{
'url': source_url + base_name + '.audio.8.zip',
'md5': '972d8afe0874720fc2f28086e7cb22a9',
},
]
label_list = ['airport', 'shopping_mall', 'metro_station', 'street_pedestrian', \
'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park']
meta_base_path = os.path.join(base_name, base_name + '.meta')
meta = os.path.join(meta_base_path, 'meta.csv')
meta_info = collections.namedtuple('META_INFO', ('filename_audio', 'filename_video', 'scene_label', 'identifier'))
subset_meta = {
'train': os.path.join(meta_base_path, 'evaluation_setup', 'fold1_train.csv'),
'dev': os.path.join(meta_base_path, 'evaluation_setup', 'fold1_evaluate.csv'),
'test': os.path.join(meta_base_path, 'evaluation_setup', 'fold1_test.csv'),
}
subset_meta_info = collections.namedtuple('SUBSET_META_INFO', ('filename_audio', 'filename_video', 'scene_label'))
audio_path = os.path.join(base_name, 'audio')
def __init__(self, mode: str = 'train', feat_type: str = 'raw', **kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
files, labels = self._get_data(mode)
super(UrbanAudioVisualScenes, self).__init__(files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, subset: str = None, skip_header: bool = True) -> List[collections.namedtuple]:
if subset is None:
meta_file = self.meta
meta_info = self.meta_info
else:
assert subset in self.subset_meta, f'Subset must be one of {list(self.subset_meta.keys())}, but got {subset}.'
meta_file = self.subset_meta[subset]
meta_info = self.subset_meta_info
ret = []
with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
lines = rf.readlines()[1:] if skip_header else rf.readlines()
for line in lines:
ret.append(meta_info(*line.strip().split('\t')))
return ret
def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves, os.path.join(DATA_HOME, self.base_name))
meta_info = self._get_meta_info(subset=mode, skip_header=True)
files = []
labels = []
for sample in meta_info:
filename, _, label = sample[:3]
filename = os.path.basename(filename)
target = self.label_list.index(label)
......
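A hedged usage sketch of the new UrbanAudioVisualScenes dataset; the constructor signature matches the __init__ shown above, and the audio archives are downloaded on first use:

# Illustrative only; not part of this diff.
from paddleaudio.datasets import UrbanAudioVisualScenes

train_ds = UrbanAudioVisualScenes(mode='train', feat_type='raw')
dev_ds = UrbanAudioVisualScenes(mode='dev', feat_type='raw')
print(len(train_ds), len(dev_ds))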
......@@ -102,8 +102,6 @@ class ESC50(AudioClassificationDataset):
meta_info = collections.namedtuple('META_INFO',
('filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take'))
audio_path = os.path.join('ESC-50-master', 'audio')
sample_rate = 44100 # 44.1 khz
duration = 5 # 5s
def __init__(self, mode: str = 'train', split: int = 1, feat_type: str = 'raw', **kwargs):
"""
......@@ -116,12 +114,7 @@ class ESC50(AudioClassificationDataset):
It identifies the feature type that the user wants to extract from an audio file.
"""
files, labels = self._get_data(mode, split)
super(ESC50, self).__init__(files=files,
labels=labels,
sample_rate=self.sample_rate,
duration=self.duration,
feat_type=feat_type,
**kwargs)
super(ESC50, self).__init__(files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self) -> List[collections.namedtuple]:
ret = []
......
......@@ -45,8 +45,6 @@ class GTZAN(AudioClassificationDataset):
meta = os.path.join('genres', 'input.mf')
meta_info = collections.namedtuple('META_INFO', ('file_path', 'label'))
audio_path = 'genres'
sample_rate = 22050
duration = 30
def __init__(self, mode='train', seed=0, n_folds=5, split=1, feat_type='raw', **kwargs):
"""
......@@ -64,12 +62,7 @@ class GTZAN(AudioClassificationDataset):
"""
assert split <= n_folds, f'The selected split should not be larger than n_folds, but got {split} > {n_folds}'
files, labels = self._get_data(mode, seed, n_folds, split)
super(GTZAN, self).__init__(files=files,
labels=labels,
sample_rate=self.sample_rate,
duration=self.duration,
feat_type=feat_type,
**kwargs)
super(GTZAN, self).__init__(files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self) -> List[collections.namedtuple]:
ret = []
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import codecs
import collections
import json
import os
from typing import Dict, List, Tuple
from paddle.io import Dataset
from tqdm import tqdm
from ..backends import load as load_audio
from ..utils.download import decompress, download_and_decompress
from ..utils.env import DATA_HOME
from ..utils.log import logger
from .dataset import feat_funcs
__all__ = ['LIBRISPEECH']
class LIBRISPEECH(Dataset):
"""
LibriSpeech is a corpus of approximately 1000 hours of 16kHz read English speech,
prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is
derived from read audiobooks from the LibriVox project, and has been carefully
segmented and aligned.
Reference:
LIBRISPEECH: AN ASR CORPUS BASED ON PUBLIC DOMAIN AUDIO BOOKS
http://www.danielpovey.com/files/2015_icassp_librispeech.pdf
"""
source_url = 'http://www.openslr.org/resources/12/'
archieves = [
{
'url': source_url + 'train-clean-100.tar.gz',
'md5': '2a93770f6d5c6c964bc36631d331a522',
},
{
'url': source_url + 'train-clean-360.tar.gz',
'md5': 'c0e676e450a7ff2f54aeade5171606fa',
},
{
'url': source_url + 'train-other-500.tar.gz',
'md5': 'd1a0fd59409feb2c614ce4d30c387708',
},
{
'url': source_url + 'dev-clean.tar.gz',
'md5': '42e2234ba48799c1f50f24a7926300a1',
},
{
'url': source_url + 'dev-other.tar.gz',
'md5': 'c8d0bcc9cca99d4f8b62fcc847357931',
},
{
'url': source_url + 'test-clean.tar.gz',
'md5': '32fa31d27d2e1cad72775fee3f4849a9',
},
{
'url': source_url + 'test-other.tar.gz',
'md5': 'fb5a50374b501bb3bac4815ee91d3135',
},
]
speaker_meta = os.path.join('LibriSpeech', 'SPEAKERS.TXT')
utt_info = collections.namedtuple('META_INFO', ('file_path', 'utt_id', 'text', 'spk_id', 'spk_gender'))
audio_path = 'LibriSpeech'
manifest_path = os.path.join('LibriSpeech', 'manifest')
subset = ['train-clean-100', 'train-clean-360', 'train-other-500', \
'dev-clean', 'dev-other', 'test-clean', 'test-other']
def __init__(self, subset: str = 'train-clean-100', feat_type: str = 'raw', **kwargs):
assert subset in self.subset, 'Dataset subset must be one of {}, but got {}'.format(self.subset, subset)
self.subset = subset
self.feat_type = feat_type
self.feat_config = kwargs
self._data = self._get_data()
super(LIBRISPEECH, self).__init__()
def _get_speaker_info(self) -> Dict[str, str]:
ret = {}
with open(os.path.join(DATA_HOME, self.speaker_meta), 'r') as rf:
for line in rf.readlines():
if ';' in line: # Skip dataset abstract
continue
spk_id, gender = map(str.strip, line.split('|')[:2]) # spk_id, gender
ret.update({spk_id: gender})
return ret
def _get_text_info(self, trans_file) -> Dict[str, str]:
ret = {}
with open(trans_file, 'r') as rf:
for line in rf.readlines():
utt_id, text = map(str.strip, line.split(' ', 1)) # utt_id, text
ret.update({utt_id: text})
return ret
def _get_data(self):
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.speaker_meta)):
download_and_decompress(self.archieves, DATA_HOME, len(self.archieves))
# Speaker info
speaker_info = self._get_speaker_info()
# Text info
text_info = {}
for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path, self.subset)):
for file in files:
if file.endswith('.trans.txt'):
text_info.update(self._get_text_info(os.path.join(root, file)))
data = []
for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path, self.subset)):
for file in files:
if file.endswith('.flac'):
utt_id = os.path.splitext(file)[0]
spk_id = utt_id.split('-')[0]
if utt_id not in text_info \
or spk_id not in speaker_info : # Skip samples with incomplete data
continue
file_path = os.path.join(root, file)
text = text_info[utt_id]
spk_gender = speaker_info[spk_id]
data.append(self.utt_info(file_path, utt_id, text, spk_id, spk_gender))
return data
def _convert_to_record(self, idx: int):
sample = self._data[idx]
record = {}
# To show all fields in a namedtuple: `type(sample)._fields`
for field in type(sample)._fields:
record[field] = getattr(sample, field)
waveform, sr = load_audio(sample[0]) # The first element of sample is file path
feat_func = feat_funcs[self.feat_type]
feat = feat_func(waveform, sample_rate=sr, **self.feat_config) if feat_func else waveform
record.update({'feat': feat, 'duration': len(waveform) / sr})
return record
def create_manifest(self, prefix='manifest'):
if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)):
os.makedirs(os.path.join(DATA_HOME, self.manifest_path))
manifest_file = os.path.join(DATA_HOME, self.manifest_path, f'{prefix}.{self.subset}')
with codecs.open(manifest_file, 'w', 'utf-8') as f:
for idx in tqdm(range(len(self))):
record = self._convert_to_record(idx)
record_line = json.dumps(
{
'utt': record['utt_id'],
'feat': record['file_path'],
'feat_shape': (record['duration'], ),
'text': record['text'],
'spk': record['spk_id'],
'gender': record['spk_gender'],
},
ensure_ascii=False)
f.write(record_line + '\n')
logger.info(f'Manifest file {manifest_file} created.')
def __getitem__(self, idx):
record = self._convert_to_record(idx)
return tuple(record.values())
def __len__(self):
return len(self._data)
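A hedged usage sketch of the LIBRISPEECH dataset; subset names follow the subset list above, and the tuple order follows the record fields:

# Illustrative only; not part of this diff.
from paddleaudio.datasets import LIBRISPEECH

dev_ds = LIBRISPEECH(subset='dev-clean', feat_type='raw')
dev_ds.create_manifest()                 # writes manifest.dev-clean under DATA_HOME
file_path, utt_id, text, spk_id, spk_gender, feat, duration = dev_ds[0]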
......@@ -40,23 +40,33 @@ class RAVDESS(AudioClassificationDataset):
archieves = [
{
'url': 'https://zenodo.org/record/1188976/files/Audio_Song_Actors_01-24.zip',
'url':
'https://zenodo.org/record/1188976/files/Audio_Song_Actors_01-24.zip',
'md5': '5411230427d67a21e18aa4d466e6d1b9',
},
{
'url': 'https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip',
'url':
'https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip',
'md5': 'bc696df654c87fed845eb13823edef8a',
},
]
label_list = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
label_list = [
'neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust',
'surprised'
]
meta_info = collections.namedtuple(
'META_INFO', ('modality', 'vocal_channel', 'emotion', 'emotion_intensity', 'statement', 'repitition', 'actor'))
'META_INFO', ('modality', 'vocal_channel', 'emotion',
'emotion_intensity', 'statement', 'repitition', 'actor'))
speech_path = os.path.join(DATA_HOME, 'Audio_Speech_Actors_01-24')
song_path = os.path.join(DATA_HOME, 'Audio_Song_Actors_01-24')
sample_rate = 44100 # 44.1 khz
duration = 5 # 5s
def __init__(self, mode='train', seed=0, n_folds=5, split=1, feat_type='raw', **kwargs):
def __init__(self,
mode='train',
seed=0,
n_folds=5,
split=1,
feat_type='raw',
**kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
......@@ -74,8 +84,6 @@ class RAVDESS(AudioClassificationDataset):
files, labels = self._get_data(mode, seed, n_folds, split)
super(RAVDESS, self).__init__(files=files,
labels=labels,
sample_rate=self.sample_rate,
duration=self.duration,
feat_type=feat_type,
**kwargs)
......@@ -86,8 +94,10 @@ class RAVDESS(AudioClassificationDataset):
ret.append(self.meta_info(*basename_without_extend.split('-')))
return ret
def _get_data(self, mode, seed, n_folds, split) -> Tuple[List[str], List[int]]:
if not os.path.isdir(self.speech_path) and not os.path.isdir(self.song_path):
def _get_data(self, mode, seed, n_folds,
split) -> Tuple[List[str], List[int]]:
if not os.path.isdir(self.speech_path) and not os.path.isdir(
self.song_path):
download_and_decompress(self.archieves, DATA_HOME)
wav_files = []
......@@ -102,7 +112,9 @@ class RAVDESS(AudioClassificationDataset):
wav_files.append(os.path.join(root, file))
random.seed(seed) # shuffle samples to split data
random.shuffle(wav_files) # make sure using the same seed to create train and dev dataset
random.shuffle(
wav_files
)  # use the same seed so the train and dev splits stay consistent
meta_info = self._get_meta_info(wav_files)
files = []
......@@ -122,8 +134,3 @@ class RAVDESS(AudioClassificationDataset):
labels.append(target)
return files, labels
if __name__ == "__main__":
train_ds = RAVDESS(mode='train', feat_type='mel_spect')
dev_ds = RAVDESS(mode='dev', feat_type='mel_spect')
......@@ -55,8 +55,6 @@ class TESS(AudioClassificationDataset):
]
meta_info = collections.namedtuple('META_INFO', ('speaker', 'word', 'emotion'))
audio_path = 'TESS_Toronto_emotional_speech_set'
sample_rate = 24414
duration = 2
def __init__(self, mode='train', seed=0, n_folds=5, split=1, feat_type='raw', **kwargs):
"""
......@@ -74,12 +72,7 @@ class TESS(AudioClassificationDataset):
"""
assert split <= n_folds, f'The selected split should not be larger than n_folds, but got {split} > {n_folds}'
files, labels = self._get_data(mode, seed, n_folds, split)
super(TESS, self).__init__(files=files,
labels=labels,
sample_rate=self.sample_rate,
duration=self.duration,
feat_type=feat_type,
**kwargs)
super(TESS, self).__init__(files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, files) -> List[collections.namedtuple]:
ret = []
......
......@@ -47,17 +47,10 @@ class UrbanSound8K(AudioClassificationDataset):
meta_info = collections.namedtuple('META_INFO',
('filename', 'fsid', 'start', 'end', 'salience', 'fold', 'class_id', 'label'))
audio_path = os.path.join('UrbanSound8K', 'audio')
sample_rate = 48000 # 48 khz
duration = 4 # 4s
def __init__(self, mode: str = 'train', split: int = 1, feat_type: str = 'raw', **kwargs):
files, labels = self._get_data(mode, split)
super(UrbanSound8K, self).__init__(files=files,
labels=labels,
sample_rate=self.sample_rate,
duration=self.duration,
feat_type=feat_type,
**kwargs)
super(UrbanSound8K, self).__init__(files=files, labels=labels, feat_type=feat_type, **kwargs)
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
......
......@@ -17,24 +17,40 @@ from typing import Dict, List
from paddle.framework import load as load_state_dict
from paddle.utils import download
from pathos.multiprocessing import ProcessPool
from .log import logger
download.logger = logger
def download_and_decompress(archives: List[Dict[str, str]], path: str):
def decompress(file: str):
"""
Extracts all files from a compressed file.
"""
assert os.path.isfile(file), "File: {} does not exist.".format(file)
download._decompress(file)
def download_and_decompress(archives: List[Dict[str, str]], path: str, n_workers: int = 0):
"""
Download archives and decompress them to the given path.
"""
if not os.path.isdir(path):
os.makedirs(path)
for archive in archives:
assert 'url' in archive and 'md5' in archive, \
'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}'
if n_workers <= 0:
for archive in archives:
assert 'url' in archive and 'md5' in archive, \
f'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archive.keys())}'
download.get_path_from_url(archive['url'], path, archive['md5'])
download.get_path_from_url(archive['url'], path, archive['md5'])
else:
pool = ProcessPool(nodes=n_workers)
pool.imap(download.get_path_from_url, [_['url'] for _ in archives], [path] * len(archives),
[_['md5'] for _ in archives])
pool.close()
pool.join()
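A hedged sketch of the new parallel download path; the URLs and md5 values below are placeholders, not real archives:

# Illustrative only; real datasets define their own archive lists.
archives = [
    {'url': 'https://example.com/part1.tar.gz', 'md5': '0' * 32},
    {'url': 'https://example.com/part2.tar.gz', 'md5': '1' * 32},
]
download_and_decompress(archives, '/tmp/paddleaudio_data', n_workers=2)   # two concurrent workers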
def load_state_dict_from_url(url: str, path: str, md5: str = None):
......