Unverified commit 98fa5803, authored by KP, committed by GitHub

Add aishell and librispeech dataset (#5312)

* Add aishell and librispeech dataset

* Add aishell and librispeech dataset

* Add aishell and librispeech dataset

* Add UrbanAudioVisualScenes Dataset

* Update features api
Parent bbdb65ed
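As a hedged illustration of the "Update features api" change (mel_spect renamed to melspectrogram), a minimal sketch that mirrors the call pattern used in the diff below; 'sample.wav' is a placeholder path, not part of this commit:

# Illustrative sketch only; not part of this diff.
import numpy as np
from paddleaudio.backends import load as load_audio
from paddleaudio.features import melspectrogram

waveform, sr = load_audio('sample.wav', sr=None)   # placeholder audio file
feats = melspectrogram(waveform, sr).transpose()   # same call pattern as in the diff
print(np.array(feats).shape)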
......@@ -20,7 +20,7 @@ from typing import List
import numpy as np
import paddle
from paddleaudio.backends import load as load_audio
from paddleaudio.features import mel_spect
from paddleaudio.features import melspectrogram
from paddleaudio.models.panns import cnn14
from paddleaudio.utils import logger
......@@ -59,8 +59,7 @@ def batchify(data: List[List[float]], sample_rate: int, batch_size: int,
"""
examples = []
for waveform in data:
feats = mel_spect(waveform, sample_rate=sample_rate,
**kwargs).transpose()
feats = melspectrogram(waveform, sample_rate, **kwargs).transpose()
examples.append(feats)
# Separates data into some batches.
......
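The batching step itself is collapsed in the hunk above; a hedged sketch of what "Separates data into some batches" could look like, assuming simple fixed-size chunking of the examples list (padding and stacking details omitted):

# Sketch only; the real (collapsed) implementation may pad and stack features.
batches = [examples[i:i + batch_size] for i in range(0, len(examples), batch_size)]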
......@@ -22,7 +22,7 @@ import paddle.nn.functional as F
from model import SoundClassifier
from paddleaudio.backends import load as load_audio
from paddleaudio.datasets import ESC50
from paddleaudio.features import mel_spect
from paddleaudio.features import melspectrogram
from paddleaudio.models.panns import cnn14
# yapf: disable
......@@ -37,14 +37,16 @@ args = parser.parse_args()
def extract_features(file: str, **kwargs):
waveform, sr = load_audio(args.wav, sr=None)
feats = mel_spect(waveform, sample_rate=sr, **kwargs).transpose()
feats = melspectrogram(waveform, sr, **kwargs).transpose()
return feats
if __name__ == '__main__':
paddle.set_device(args.device)
model = SoundClassifier(backbone=cnn14(pretrained=False, extract_embedding=True), num_class=len(ESC50.label_list))
model = SoundClassifier(backbone=cnn14(pretrained=False,
extract_embedding=True),
num_class=len(ESC50.label_list))
model.set_state_dict(paddle.load(args.checkpoint))
model.eval()
......
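A hedged sketch of how the loaded classifier above might score a single clip; the forward signature of SoundClassifier is not shown in this diff, so the batched tensor layout below is an assumption:

# Illustrative only; tensor layout and forward signature are assumptions.
import numpy as np
feats = extract_features(args.wav)
feats = paddle.to_tensor(np.expand_dims(feats, 0))            # add a batch dimension
logits = model(feats)
probs = paddle.nn.functional.softmax(logits, axis=1).numpy()
print(ESC50.label_list[int(probs[0].argmax())])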
......@@ -43,16 +43,15 @@ if __name__ == "__main__":
backbone = cnn14(pretrained=True, extract_embedding=True)
model = SoundClassifier(backbone, num_class=len(ESC50.label_list))
optimizer = paddle.optimizer.Adam(learning_rate=args.learning_rate, parameters=model.parameters())
optimizer = paddle.optimizer.Adam(learning_rate=args.learning_rate,
parameters=model.parameters())
criterion = paddle.nn.loss.CrossEntropyLoss()
train_ds = ESC50(mode='train', feat_type='mel_spect')
dev_ds = ESC50(mode='dev', feat_type='mel_spect')
train_ds = ESC50(mode='train', feat_type='melspectrogram')
dev_ds = ESC50(mode='dev', feat_type='melspectrogram')
train_sampler = paddle.io.DistributedBatchSampler(train_ds,
batch_size=args.batch_size,
shuffle=True,
drop_last=False)
train_sampler = paddle.io.DistributedBatchSampler(
train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False)
train_loader = paddle.io.DataLoader(
train_ds,
batch_sampler=train_sampler,
......@@ -78,7 +77,8 @@ if __name__ == "__main__":
loss = criterion(logits, labels)
loss.backward()
optimizer.step()
if isinstance(optimizer._learning_rate, paddle.optimizer.lr.LRScheduler):
if isinstance(optimizer._learning_rate,
paddle.optimizer.lr.LRScheduler):
optimizer._learning_rate.step()
optimizer.clear_grad()
......@@ -97,10 +97,12 @@ if __name__ == "__main__":
avg_loss /= args.log_freq
avg_acc = num_corrects / num_samples
print_msg = 'Epoch={}/{}, Step={}/{}'.format(epoch, args.epochs, batch_idx + 1, steps_per_epoch)
print_msg = 'Epoch={}/{}, Step={}/{}'.format(
epoch, args.epochs, batch_idx + 1, steps_per_epoch)
print_msg += ' loss={:.4f}'.format(avg_loss)
print_msg += ' acc={:.4f}'.format(avg_acc)
print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format(lr, timer.timing, timer.eta)
print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format(
lr, timer.timing, timer.eta)
logger.train(print_msg)
avg_loss = 0
......@@ -108,7 +110,10 @@ if __name__ == "__main__":
num_samples = 0
if epoch % args.save_freq == 0 and batch_idx + 1 == steps_per_epoch and local_rank == 0:
dev_sampler = paddle.io.BatchSampler(dev_ds, batch_size=args.batch_size, shuffle=False, drop_last=False)
dev_sampler = paddle.io.BatchSampler(dev_ds,
batch_size=args.batch_size,
shuffle=False,
drop_last=False)
dev_loader = paddle.io.DataLoader(
dev_ds,
batch_sampler=dev_sampler,
......@@ -134,7 +139,10 @@ if __name__ == "__main__":
logger.eval(print_msg)
# Save model
save_dir = os.path.join(args.checkpoint_dir, 'epoch_{}'.format(epoch))
save_dir = os.path.join(args.checkpoint_dir,
'epoch_{}'.format(epoch))
logger.info('Saving model checkpoint to {}'.format(save_dir))
paddle.save(model.state_dict(), os.path.join(save_dir, 'model.pdparams'))
paddle.save(optimizer.state_dict(), os.path.join(save_dir, 'model.pdopt'))
paddle.save(model.state_dict(),
os.path.join(save_dir, 'model.pdparams'))
paddle.save(optimizer.state_dict(),
os.path.join(save_dir, 'model.pdopt'))
......@@ -12,18 +12,23 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .dcase import UrbanAcousticScenes
from .aishell import AISHELL1
from .dcase import UrbanAcousticScenes, UrbanAudioVisualScenes
from .esc50 import ESC50
from .gtzan import GTZAN
from .librispeech import LIBRISPEECH
from .ravdess import RAVDESS
from .tess import TESS
from .urban_sound import UrbanSound8K
__all__ = [
'AISHELL1',
'LIBRISPEECH',
'ESC50',
'UrbanSound8K',
'GTZAN',
'UrbanAcousticScenes',
'UrbanAudioVisualScenes',
'RAVDESS',
'TESS',
]
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import codecs
import collections
import json
import os
from typing import Dict, List, Tuple
from paddle.io import Dataset
from tqdm import tqdm
from ..backends import load as load_audio
from ..utils.download import decompress, download_and_decompress
from ..utils.env import DATA_HOME
from ..utils.log import logger
from .dataset import feat_funcs
__all__ = ['AISHELL1']
class AISHELL1(Dataset):
"""
This open-source Mandarin speech corpus, AISHELL-ASR0009-OS1, is 178 hours long.
It is part of AISHELL-ASR0009, whose utterances cover 11 domains, including
smart home, autonomous driving, and industrial production. All recordings were
made in a quiet indoor environment using 3 devices simultaneously: a high-fidelity
microphone (44.1 kHz, 16-bit), an Android phone (16 kHz, 16-bit), and an iOS phone
(16 kHz, 16-bit). The high-fidelity audio was re-sampled to 16 kHz to build
AISHELL-ASR0009-OS1. 400 speakers from different accent areas in China were
invited to participate in the recording. Through professional speech annotation
and strict quality inspection, the manual transcription accuracy is above 95%.
The corpus is divided into training, development and testing sets.
Reference:
AISHELL-1: An Open-Source Mandarin Speech Corpus and A Speech Recognition Baseline
https://arxiv.org/abs/1709.05522
"""
archieves = [
{
'url': 'http://www.openslr.org/resources/33/data_aishell.tgz',
'md5': '2f494334227864a8a8fec932999db9d8',
},
]
text_meta = os.path.join('data_aishell', 'transcript', 'aishell_transcript_v0.8.txt')
utt_info = collections.namedtuple('META_INFO', ('file_path', 'utt_id', 'text'))
audio_path = os.path.join('data_aishell', 'wav')
manifest_path = os.path.join('data_aishell', 'manifest')
subset = ['train', 'dev', 'test']
def __init__(self, subset: str = 'train', feat_type: str = 'raw', **kwargs):
assert subset in self.subset, 'Dataset subset must be one of {}, but got {}'.format(self.subset, subset)
self.subset = subset
self.feat_type = feat_type
self.feat_config = kwargs
self._data = self._get_data()
super(AISHELL1, self).__init__()
def _get_text_info(self) -> Dict[str, str]:
ret = {}
with open(os.path.join(DATA_HOME, self.text_meta), 'r') as rf:
for line in rf.readlines()[1:]:
utt_id, text = map(str.strip, line.split(' ', 1)) # utt_id, text
ret.update({utt_id: ''.join(text.split())})
return ret
def _get_data(self):
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.text_meta)):
download_and_decompress(self.archieves, DATA_HOME)
# Extract *wav from *.tar.gz.
for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path)):
for file in files:
if file.endswith('.tar.gz'):
decompress(os.path.join(root, file))
os.remove(os.path.join(root, file))
text_info = self._get_text_info()
data = []
for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path, self.subset)):
for file in files:
if file.endswith('.wav'):
utt_id = os.path.splitext(file)[0]
if utt_id not in text_info:  # Some utterances have no transcription label
continue
text = text_info[utt_id]
file_path = os.path.join(root, file)
data.append(self.utt_info(file_path, utt_id, text))
return data
def _convert_to_record(self, idx: int):
sample = self._data[idx]
record = {}
# To show all fields in a namedtuple: `type(sample)._fields`
for field in type(sample)._fields:
record[field] = getattr(sample, field)
waveform, sr = load_audio(sample[0]) # The first element of sample is file path
feat_func = feat_funcs[self.feat_type]
feat = feat_func(waveform, sample_rate=sr, **self.feat_config) if feat_func else waveform
record.update({'feat': feat, 'duration': len(waveform) / sr})
return record
def create_manifest(self, prefix='manifest'):
if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)):
os.makedirs(os.path.join(DATA_HOME, self.manifest_path))
manifest_file = os.path.join(DATA_HOME, self.manifest_path, f'{prefix}.{self.subset}')
with codecs.open(manifest_file, 'w', 'utf-8') as f:
for idx in tqdm(range(len(self))):
record = self._convert_to_record(idx)
record_line = json.dumps(
{
'utt': record['utt_id'],
'feat': record['file_path'],
'feat_shape': (record['duration'], ),
'text': record['text']
},
ensure_ascii=False)
f.write(record_line + '\n')
logger.info(f'Manifest file {manifest_file} created.')
def __getitem__(self, idx):
record = self._convert_to_record(idx)
return tuple(record.values())
def __len__(self):
return len(self._data)
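A hedged usage sketch of the AISHELL1 dataset added above; the first access triggers download and decompression, and the tuple order follows the record fields built in _convert_to_record:

# Illustrative only; not part of this diff.
from paddleaudio.datasets import AISHELL1

train_ds = AISHELL1(subset='train', feat_type='raw')   # 'melspectrogram' and 'mfcc' are also supported
train_ds.create_manifest(prefix='manifest')            # writes manifest.train under DATA_HOME
file_path, utt_id, text, feat, duration = train_ds[0]  # order follows the record fields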
......@@ -12,55 +12,50 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
from typing import List, Tuple
from typing import Dict, List, Tuple
import numpy as np
import paddle
from pathos.multiprocessing import ProcessPool
from pathos.threading import ThreadPool
from tqdm import tqdm
from ..backends import load as load_audio
from ..features import linear_spect, log_spect, mel_spect
from ..utils.log import logger
from ..features import melspectrogram, mfcc
feat_funcs = {
'raw': None,
'melspectrogram': melspectrogram,
'mfcc': mfcc,
}
class AudioClassificationDataset(paddle.io.Dataset):
"""
Base class of audio classification dataset.
"""
_feat_func = {
'raw': None,
'mel_spect': mel_spect,
'linear_spect': linear_spect,
'log_spect': log_spect,
}
def __init__(self,
files: List[str],
labels: List[int],
sample_rate: int,
duration: float,
feat_type: str = 'raw',
**kwargs):
"""
Args:
files (:obj:`List[str]`): A list of absolute path of audio files.
labels (:obj:`List[int]`): Labels of audio files.
sample_rate (:obj:`int`): Sample rate of audio files.
duration (:obj:`float`): Duration of audio files.
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
super(AudioClassificationDataset, self).__init__()
if feat_type not in self._feat_func.keys():
if feat_type not in feat_funcs.keys():
raise RuntimeError(\
f"Unknown feat_type: {feat_type}, it must be one in {list(self._feat_func.keys())}")
f"Unknown feat_type: {feat_type}, it must be one in {list(feat_funcs.keys())}")
self.files = files
self.labels = labels
self.sample_rate = sample_rate
self.duration = duration
self.feat_type = feat_type
self.feat_config = kwargs # Pass keyword arguments to customize feature config
......@@ -71,24 +66,19 @@ class AudioClassificationDataset(paddle.io.Dataset):
def _convert_to_record(self, idx):
file, label = self.files[idx], self.labels[idx]
waveform, _ = load_audio(file, sr=self.sample_rate)
normal_length = self.sample_rate * self.duration
if len(waveform) > normal_length:
waveform = waveform[:normal_length]
else:
waveform = np.pad(waveform, (0, normal_length - len(waveform)))
feat_func = self._feat_func[self.feat_type]
waveform, sample_rate = load_audio(file)
feat_func = feat_funcs[self.feat_type]
record = {}
record['feat'] = feat_func(waveform, sample_rate=self.sample_rate, **
record['feat'] = feat_func(waveform, sample_rate, **
self.feat_config) if feat_func else waveform
record['label'] = label
return record
def __getitem__(self, idx):
record = self._convert_to_record(idx)
return np.array(record['feat']).transpose(), np.array(record['label'], dtype=np.int64)
return np.array(record['feat']).transpose(), np.array(record['label'],
dtype=np.int64)
def __len__(self):
return len(self.files)
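Since __getitem__ returns (feature, label) pairs, a hedged sketch of feeding a subclass such as ESC50 into a Paddle DataLoader, mirroring the train.py changes earlier in this commit:

# Illustrative only; batch size and shuffling are arbitrary here.
import paddle
from paddleaudio.datasets import ESC50

train_ds = ESC50(mode='train', feat_type='melspectrogram')
loader = paddle.io.DataLoader(train_ds, batch_size=16, shuffle=True)
feats, labels = next(iter(loader))   # feats: batched features; labels: int64 class ids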
......@@ -20,7 +20,7 @@ from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['UrbanAcousticScenes']
__all__ = ['UrbanAcousticScenes', 'UrbanAudioVisualScenes']
class UrbanAcousticScenes(AudioClassificationDataset):
......@@ -119,8 +119,6 @@ class UrbanAcousticScenes(AudioClassificationDataset):
}
subset_meta_info = collections.namedtuple('SUBSET_META_INFO', ('filename', 'scene_label'))
audio_path = os.path.join(base_name, 'audio')
sample_rate = 44100 # 44.1 khz
duration = 10 # 10s
def __init__(self, mode: str = 'train', feat_type: str = 'raw', **kwargs):
"""
......@@ -131,12 +129,7 @@ class UrbanAcousticScenes(AudioClassificationDataset):
It identifies the feature type that the user wants to extract from an audio file.
"""
files, labels = self._get_data(mode)
super(UrbanAcousticScenes, self).__init__(files=files,
labels=labels,
sample_rate=self.sample_rate,
duration=self.duration,
feat_type=feat_type,
**kwargs)
super(UrbanAcousticScenes, self).__init__(files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, subset: str = None, skip_header: bool = True) -> List[collections.namedtuple]:
if subset is None:
......@@ -164,7 +157,121 @@ class UrbanAcousticScenes(AudioClassificationDataset):
files = []
labels = []
for sample in meta_info:
filename, label = sample
filename, label = sample[:2]
filename = os.path.basename(filename)
target = self.label_list.index(label)
files.append(os.path.join(DATA_HOME, self.audio_path, filename))
labels.append(int(target))
return files, labels
class UrbanAudioVisualScenes(AudioClassificationDataset):
"""
The TAU Urban Audio Visual Scenes 2021 development dataset contains synchronized audio
and video recordings from 12 European cities in 10 different scenes.
This dataset consists of 10-second audio and video segments from 10
acoustic scenes. The total amount of audio in the development set is 34 hours.
Reference:
A Curated Dataset of Urban Scenes for Audio-Visual Scene Analysis
https://arxiv.org/abs/2011.00030
"""
source_url = 'https://zenodo.org/record/4477542/files/'
base_name = 'TAU-urban-audio-visual-scenes-2021-development'
archieves = [
{
'url': source_url + base_name + '.meta.zip',
'md5': '76e3d7ed5291b118372e06379cb2b490',
},
{
'url': source_url + base_name + '.audio.1.zip',
'md5': '186f6273f8f69ed9dbdc18ad65ac234f',
},
{
'url': source_url + base_name + '.audio.2.zip',
'md5': '7fd6bb63127f5785874a55aba4e77aa5',
},
{
'url': source_url + base_name + '.audio.3.zip',
'md5': '61396bede29d7c8c89729a01a6f6b2e2',
},
{
'url': source_url + base_name + '.audio.4.zip',
'md5': '6ddac89717fcf9c92c451868eed77fe1',
},
{
'url': source_url + base_name + '.audio.5.zip',
'md5': 'af4820756cdf1a7d4bd6037dc034d384',
},
{
'url': source_url + base_name + '.audio.6.zip',
'md5': 'ebd11ec24411f2a17a64723bd4aa7fff',
},
{
'url': source_url + base_name + '.audio.7.zip',
'md5': '2be39a76aeed704d5929d020a2909efd',
},
{
'url': source_url + base_name + '.audio.8.zip',
'md5': '972d8afe0874720fc2f28086e7cb22a9',
},
]
label_list = ['airport', 'shopping_mall', 'metro_station', 'street_pedestrian', \
'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park']
meta_base_path = os.path.join(base_name, base_name + '.meta')
meta = os.path.join(meta_base_path, 'meta.csv')
meta_info = collections.namedtuple('META_INFO', ('filename_audio', 'filename_video', 'scene_label', 'identifier'))
subset_meta = {
'train': os.path.join(meta_base_path, 'evaluation_setup', 'fold1_train.csv'),
'dev': os.path.join(meta_base_path, 'evaluation_setup', 'fold1_evaluate.csv'),
'test': os.path.join(meta_base_path, 'evaluation_setup', 'fold1_test.csv'),
}
subset_meta_info = collections.namedtuple('SUBSET_META_INFO', ('filename_audio', 'filename_video', 'scene_label'))
audio_path = os.path.join(base_name, 'audio')
def __init__(self, mode: str = 'train', feat_type: str = 'raw', **kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
files, labels = self._get_data(mode)
super(UrbanAudioVisualScenes, self).__init__(files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, subset: str = None, skip_header: bool = True) -> List[collections.namedtuple]:
if subset is None:
meta_file = self.meta
meta_info = self.meta_info
else:
assert subset in self.subset_meta, f'Subset must be one of {list(self.subset_meta.keys())}, but got {subset}.'
meta_file = self.subset_meta[subset]
meta_info = self.subset_meta_info
ret = []
with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
lines = rf.readlines()[1:] if skip_header else rf.readlines()
for line in lines:
ret.append(meta_info(*line.strip().split('\t')))
return ret
def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves, os.path.join(DATA_HOME, self.base_name))
meta_info = self._get_meta_info(subset=mode, skip_header=True)
files = []
labels = []
for sample in meta_info:
filename, _, label = sample[:3]
filename = os.path.basename(filename)
target = self.label_list.index(label)
......
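A hedged usage sketch of the new UrbanAudioVisualScenes dataset; the constructor signature matches the __init__ shown above, and the audio archives are downloaded on first use:

# Illustrative only; not part of this diff.
from paddleaudio.datasets import UrbanAudioVisualScenes

train_ds = UrbanAudioVisualScenes(mode='train', feat_type='raw')
dev_ds = UrbanAudioVisualScenes(mode='dev', feat_type='raw')
print(len(train_ds), len(dev_ds))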
......@@ -102,8 +102,6 @@ class ESC50(AudioClassificationDataset):
meta_info = collections.namedtuple('META_INFO',
('filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take'))
audio_path = os.path.join('ESC-50-master', 'audio')
sample_rate = 44100 # 44.1 khz
duration = 5 # 5s
def __init__(self, mode: str = 'train', split: int = 1, feat_type: str = 'raw', **kwargs):
"""
......@@ -116,12 +114,7 @@ class ESC50(AudioClassificationDataset):
It identifies the feature type that the user wants to extract from an audio file.
"""
files, labels = self._get_data(mode, split)
super(ESC50, self).__init__(files=files,
labels=labels,
sample_rate=self.sample_rate,
duration=self.duration,
feat_type=feat_type,
**kwargs)
super(ESC50, self).__init__(files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self) -> List[collections.namedtuple]:
ret = []
......
......@@ -45,8 +45,6 @@ class GTZAN(AudioClassificationDataset):
meta = os.path.join('genres', 'input.mf')
meta_info = collections.namedtuple('META_INFO', ('file_path', 'label'))
audio_path = 'genres'
sample_rate = 22050
duration = 30
def __init__(self, mode='train', seed=0, n_folds=5, split=1, feat_type='raw', **kwargs):
"""
......@@ -64,12 +62,7 @@ class GTZAN(AudioClassificationDataset):
"""
assert split <= n_folds, f'The selected split should not be larger than n_folds, but got {split} > {n_folds}'
files, labels = self._get_data(mode, seed, n_folds, split)
super(GTZAN, self).__init__(files=files,
labels=labels,
sample_rate=self.sample_rate,
duration=self.duration,
feat_type=feat_type,
**kwargs)
super(GTZAN, self).__init__(files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self) -> List[collections.namedtuple]:
ret = []
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import codecs
import collections
import json
import os
from typing import Dict, List, Tuple
from paddle.io import Dataset
from tqdm import tqdm
from ..backends import load as load_audio
from ..utils.download import decompress, download_and_decompress
from ..utils.env import DATA_HOME
from ..utils.log import logger
from .dataset import feat_funcs
__all__ = ['LIBRISPEECH']
class LIBRISPEECH(Dataset):
"""
LibriSpeech is a corpus of approximately 1000 hours of 16kHz read English speech,
prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is
derived from read audiobooks from the LibriVox project, and has been carefully
segmented and aligned.
Reference:
LIBRISPEECH: AN ASR CORPUS BASED ON PUBLIC DOMAIN AUDIO BOOKS
http://www.danielpovey.com/files/2015_icassp_librispeech.pdf
"""
source_url = 'http://www.openslr.org/resources/12/'
archieves = [
{
'url': source_url + 'train-clean-100.tar.gz',
'md5': '2a93770f6d5c6c964bc36631d331a522',
},
{
'url': source_url + 'train-clean-360.tar.gz',
'md5': 'c0e676e450a7ff2f54aeade5171606fa',
},
{
'url': source_url + 'train-other-500.tar.gz',
'md5': 'd1a0fd59409feb2c614ce4d30c387708',
},
{
'url': source_url + 'dev-clean.tar.gz',
'md5': '42e2234ba48799c1f50f24a7926300a1',
},
{
'url': source_url + 'dev-other.tar.gz',
'md5': 'c8d0bcc9cca99d4f8b62fcc847357931',
},
{
'url': source_url + 'test-clean.tar.gz',
'md5': '32fa31d27d2e1cad72775fee3f4849a9',
},
{
'url': source_url + 'test-other.tar.gz',
'md5': 'fb5a50374b501bb3bac4815ee91d3135',
},
]
speaker_meta = os.path.join('LibriSpeech', 'SPEAKERS.TXT')
utt_info = collections.namedtuple('META_INFO', ('file_path', 'utt_id', 'text', 'spk_id', 'spk_gender'))
audio_path = 'LibriSpeech'
manifest_path = os.path.join('LibriSpeech', 'manifest')
subset = ['train-clean-100', 'train-clean-360', 'train-other-500', \
'dev-clean', 'dev-other', 'test-clean', 'test-other']
def __init__(self, subset: str = 'train-clean-100', feat_type: str = 'raw', **kwargs):
assert subset in self.subset, 'Dataset subset must be one of {}, but got {}'.format(self.subset, subset)
self.subset = subset
self.feat_type = feat_type
self.feat_config = kwargs
self._data = self._get_data()
super(LIBRISPEECH, self).__init__()
def _get_speaker_info(self) -> Dict[str, str]:
ret = {}
with open(os.path.join(DATA_HOME, self.speaker_meta), 'r') as rf:
for line in rf.readlines():
if ';' in line: # Skip dataset abstract
continue
spk_id, gender = map(str.strip, line.split('|')[:2]) # spk_id, gender
ret.update({spk_id: gender})
return ret
def _get_text_info(self, trans_file) -> Dict[str, str]:
ret = {}
with open(trans_file, 'r') as rf:
for line in rf.readlines():
utt_id, text = map(str.strip, line.split(' ', 1)) # utt_id, text
ret.update({utt_id: text})
return ret
def _get_data(self):
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.speaker_meta)):
download_and_decompress(self.archieves, DATA_HOME, len(self.archieves))
# Speaker info
speaker_info = self._get_speaker_info()
# Text info
text_info = {}
for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path, self.subset)):
for file in files:
if file.endswith('.trans.txt'):
text_info.update(self._get_text_info(os.path.join(root, file)))
data = []
for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path, self.subset)):
for file in files:
if file.endswith('.flac'):
utt_id = os.path.splitext(file)[0]
spk_id = utt_id.split('-')[0]
if utt_id not in text_info \
or spk_id not in speaker_info : # Skip samples with incomplete data
continue
file_path = os.path.join(root, file)
text = text_info[utt_id]
spk_gender = speaker_info[spk_id]
data.append(self.utt_info(file_path, utt_id, text, spk_id, spk_gender))
return data
def _convert_to_record(self, idx: int):
sample = self._data[idx]
record = {}
# To show all fields in a namedtuple: `type(sample)._fields`
for field in type(sample)._fields:
record[field] = getattr(sample, field)
waveform, sr = load_audio(sample[0]) # The first element of sample is file path
feat_func = feat_funcs[self.feat_type]
feat = feat_func(waveform, sample_rate=sr, **self.feat_config) if feat_func else waveform
record.update({'feat': feat, 'duration': len(waveform) / sr})
return record
def create_manifest(self, prefix='manifest'):
if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)):
os.makedirs(os.path.join(DATA_HOME, self.manifest_path))
manifest_file = os.path.join(DATA_HOME, self.manifest_path, f'{prefix}.{self.subset}')
with codecs.open(manifest_file, 'w', 'utf-8') as f:
for idx in tqdm(range(len(self))):
record = self._convert_to_record(idx)
record_line = json.dumps(
{
'utt': record['utt_id'],
'feat': record['file_path'],
'feat_shape': (record['duration'], ),
'text': record['text'],
'spk': record['spk_id'],
'gender': record['spk_gender'],
},
ensure_ascii=False)
f.write(record_line + '\n')
logger.info(f'Manifest file {manifest_file} created.')
def __getitem__(self, idx):
record = self._convert_to_record(idx)
return tuple(record.values())
def __len__(self):
return len(self._data)
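A hedged usage sketch of the LIBRISPEECH dataset; subset names follow the subset list above, and the tuple order follows the record fields:

# Illustrative only; not part of this diff.
from paddleaudio.datasets import LIBRISPEECH

dev_ds = LIBRISPEECH(subset='dev-clean', feat_type='raw')
dev_ds.create_manifest()                 # writes manifest.dev-clean under DATA_HOME
file_path, utt_id, text, spk_id, spk_gender, feat, duration = dev_ds[0]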
......@@ -40,23 +40,33 @@ class RAVDESS(AudioClassificationDataset):
archieves = [
{
'url': 'https://zenodo.org/record/1188976/files/Audio_Song_Actors_01-24.zip',
'url':
'https://zenodo.org/record/1188976/files/Audio_Song_Actors_01-24.zip',
'md5': '5411230427d67a21e18aa4d466e6d1b9',
},
{
'url': 'https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip',
'url':
'https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip',
'md5': 'bc696df654c87fed845eb13823edef8a',
},
]
label_list = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
label_list = [
'neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust',
'surprised'
]
meta_info = collections.namedtuple(
'META_INFO', ('modality', 'vocal_channel', 'emotion', 'emotion_intensity', 'statement', 'repitition', 'actor'))
'META_INFO', ('modality', 'vocal_channel', 'emotion',
'emotion_intensity', 'statement', 'repitition', 'actor'))
speech_path = os.path.join(DATA_HOME, 'Audio_Speech_Actors_01-24')
song_path = os.path.join(DATA_HOME, 'Audio_Song_Actors_01-24')
sample_rate = 44100 # 44.1 khz
duration = 5 # 5s
def __init__(self, mode='train', seed=0, n_folds=5, split=1, feat_type='raw', **kwargs):
def __init__(self,
mode='train',
seed=0,
n_folds=5,
split=1,
feat_type='raw',
**kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
......@@ -74,8 +84,6 @@ class RAVDESS(AudioClassificationDataset):
files, labels = self._get_data(mode, seed, n_folds, split)
super(RAVDESS, self).__init__(files=files,
labels=labels,
sample_rate=self.sample_rate,
duration=self.duration,
feat_type=feat_type,
**kwargs)
......@@ -86,8 +94,10 @@ class RAVDESS(AudioClassificationDataset):
ret.append(self.meta_info(*basename_without_extend.split('-')))
return ret
def _get_data(self, mode, seed, n_folds, split) -> Tuple[List[str], List[int]]:
if not os.path.isdir(self.speech_path) and not os.path.isdir(self.song_path):
def _get_data(self, mode, seed, n_folds,
split) -> Tuple[List[str], List[int]]:
if not os.path.isdir(self.speech_path) and not os.path.isdir(
self.song_path):
download_and_decompress(self.archieves, DATA_HOME)
wav_files = []
......@@ -102,7 +112,9 @@ class RAVDESS(AudioClassificationDataset):
wav_files.append(os.path.join(root, file))
random.seed(seed) # shuffle samples to split data
random.shuffle(wav_files) # make sure using the same seed to create train and dev dataset
random.shuffle(
wav_files
)  # use the same seed so the train and dev splits stay consistent
meta_info = self._get_meta_info(wav_files)
files = []
......@@ -122,8 +134,3 @@ class RAVDESS(AudioClassificationDataset):
labels.append(target)
return files, labels
if __name__ == "__main__":
train_ds = RAVDESS(mode='train', feat_type='mel_spect')
dev_ds = RAVDESS(mode='dev', feat_type='mel_spect')
......@@ -55,8 +55,6 @@ class TESS(AudioClassificationDataset):
]
meta_info = collections.namedtuple('META_INFO', ('speaker', 'word', 'emotion'))
audio_path = 'TESS_Toronto_emotional_speech_set'
sample_rate = 24414
duration = 2
def __init__(self, mode='train', seed=0, n_folds=5, split=1, feat_type='raw', **kwargs):
"""
......@@ -74,12 +72,7 @@ class TESS(AudioClassificationDataset):
"""
assert split <= n_folds, f'The selected split should not be larger than n_folds, but got {split} > {n_folds}'
files, labels = self._get_data(mode, seed, n_folds, split)
super(TESS, self).__init__(files=files,
labels=labels,
sample_rate=self.sample_rate,
duration=self.duration,
feat_type=feat_type,
**kwargs)
super(TESS, self).__init__(files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, files) -> List[collections.namedtuple]:
ret = []
......
......@@ -47,17 +47,10 @@ class UrbanSound8K(AudioClassificationDataset):
meta_info = collections.namedtuple('META_INFO',
('filename', 'fsid', 'start', 'end', 'salience', 'fold', 'class_id', 'label'))
audio_path = os.path.join('UrbanSound8K', 'audio')
sample_rate = 48000 # 48 khz
duration = 4 # 4s
def __init__(self, mode: str = 'train', split: int = 1, feat_type: str = 'raw', **kwargs):
files, labels = self._get_data(mode, split)
super(UrbanSound8K, self).__init__(files=files,
labels=labels,
sample_rate=self.sample_rate,
duration=self.duration,
feat_type=feat_type,
**kwargs)
super(UrbanSound8K, self).__init__(files=files, labels=labels, feat_type=feat_type, **kwargs)
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
......
......@@ -17,24 +17,40 @@ from typing import Dict, List
from paddle.framework import load as load_state_dict
from paddle.utils import download
from pathos.multiprocessing import ProcessPool
from .log import logger
download.logger = logger
def download_and_decompress(archives: List[Dict[str, str]], path: str):
def decompress(file: str):
"""
Extracts all files from a compressed file.
"""
assert os.path.isfile(file), "File: {} does not exist.".format(file)
download._decompress(file)
def download_and_decompress(archives: List[Dict[str, str]], path: str, n_workers: int = 0):
"""
Download archives and decompress them to the given path.
"""
if not os.path.isdir(path):
os.makedirs(path)
for archive in archives:
assert 'url' in archive and 'md5' in archive, \
'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}'
if n_workers <= 0:
for archive in archives:
assert 'url' in archive and 'md5' in archive, \
f'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archive.keys())}'
download.get_path_from_url(archive['url'], path, archive['md5'])
download.get_path_from_url(archive['url'], path, archive['md5'])
else:
pool = ProcessPool(nodes=n_workers)
pool.imap(download.get_path_from_url, [_['url'] for _ in archives], [path] * len(archives),
[_['md5'] for _ in archives])
pool.close()
pool.join()
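A hedged sketch of the new parallel download path; the URLs and md5 values below are placeholders, not real archives:

# Illustrative only; real datasets define their own archive lists.
archives = [
    {'url': 'https://example.com/part1.tar.gz', 'md5': '0' * 32},
    {'url': 'https://example.com/part2.tar.gz', 'md5': '1' * 32},
]
download_and_decompress(archives, '/tmp/paddleaudio_data', n_workers=2)   # two concurrent workers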
def load_state_dict_from_url(url: str, path: str, md5: str = None):
......