add waveform augment pipeline, test=doc

2d89c80e · xiongxinlei · ac4967e2 · 2d89c80e · 2d89c80e · 2d89c80e
8 changed files
--- a/examples/voxceleb/sv0/local/speaker_verification_cosine.py
+++ b/examples/voxceleb/sv0/local/speaker_verification_cosine.py
@@ -23,9 +23,13 @@ from paddle.io import DataLoader
 from tqdm import tqdm
 from paddleaudio.datasets.voxceleb import VoxCeleb1
+from paddlespeech.s2t.utils.log import Log
 from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification
 from paddlespeech.vector.training.metrics import compute_eer
+from paddlespeech.vector.training.seeding import seed_everything
+logger = Log(__name__).getlog()
 def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs):
@@ -67,9 +71,19 @@ def feature_normalize(batch, mean_norm: bool=True, std_norm: bool=True):
    return {'ids': ids, 'feats': feats, 'lengths': lengths}
+# feat configuration
+cpu_feat_conf = {
+    'n_mels': 80,
+    'window_size': 400,  #ms
+    'hop_length': 160,  #ms
+}
 def main(args):
    # stage0: set the training device, cpu or gpu
    paddle.set_device(args.device)
+    # set the random seed, it is a must for multiprocess training
+    seed_everything(args.seed)
    # stage1: build the dnn backbone model network
    ##"channels": [1024, 1024, 1024, 1024, 3072],
@@ -95,19 +109,18 @@ def main(args):
    state_dict = paddle.load(
        os.path.join(args.load_checkpoint, 'model.pdparams'))
    model.set_state_dict(state_dict)
-    print(f'Checkpoint loaded from {args.load_checkpoint}')
+    logger.info(f'Checkpoint loaded from {args.load_checkpoint}')
    # stage4: construct the enroll and test dataloader
    enrol_ds = VoxCeleb1(
        subset='enrol',
+        target_dir=args.data_dir,
        feat_type='melspectrogram',
        random_chunk=False,
-        n_mels=80,
+        **cpu_feat_conf)
-        window_size=400,
-        hop_length=160)
    enrol_sampler = BatchSampler(
        enrol_ds, batch_size=args.batch_size,
-        shuffle=True)  # Shuffle to make embedding normalization more robust.
+        shuffle=False)  # Keep the enrol batches in dataset order (no shuffling).
    enrol_loader = DataLoader(enrol_ds,
                    batch_sampler=enrol_sampler,
                    collate_fn=lambda x: feature_normalize(
@@ -117,14 +130,13 @@ def main(args):
    test_ds = VoxCeleb1(
        subset='test',
+        target_dir=args.data_dir,
        feat_type='melspectrogram',
        random_chunk=False,
-        n_mels=80,
+        **cpu_feat_conf)
-        window_size=400,
-        hop_length=160)
    test_sampler = BatchSampler(
-        test_ds, batch_size=args.batch_size, shuffle=True)
+        test_ds, batch_size=args.batch_size, shuffle=False)
    test_loader = DataLoader(test_ds,
                            batch_sampler=test_sampler,
                            collate_fn=lambda x: feature_normalize(
@@ -136,10 +148,10 @@ def main(args):
    # stage7: global embedding norm to imporve the performance
    if args.global_embedding_norm:
-        embedding_mean = None
+        global_embedding_mean = None
-        embedding_std = None
+        global_embedding_std = None
-        mean_norm = args.embedding_mean_norm
+        mean_norm_flag = args.embedding_mean_norm
-        std_norm = args.embedding_std_norm
+        std_norm_flag = args.embedding_std_norm
        batch_count = 0
    # stage8: Compute embeddings of audios in enrol and test dataset from model.
@@ -147,7 +159,7 @@ def main(args):
    # Run multi times to make embedding normalization more stable.
    for i in range(2):
        for dl in [enrol_loader, test_loader]:
-            print(
+            logger.info(
                f'Loop {[i+1]}: Computing embeddings on {dl.dataset.subset} dataset'
            )
            with paddle.no_grad():
@@ -162,20 +174,24 @@ def main(args):
                    # Global embedding normalization.
                    if args.global_embedding_norm:
                        batch_count += 1
-                        mean = embeddings.mean(axis=0) if mean_norm else 0
+                        current_mean = embeddings.mean(
-                        std = embeddings.std(axis=0) if std_norm else 1
+                            axis=0) if mean_norm_flag else 0
+                        current_std = embeddings.std(
+                            axis=0) if std_norm_flag else 1
                        # Update global mean and std.
-                        if embedding_mean is None and embedding_std is None:
+                        if global_embedding_mean is None and global_embedding_std is None:
-                            embedding_mean, embedding_std = mean, std
+                            global_embedding_mean, global_embedding_std = current_mean, current_std
                        else:
                            weight = 1 / batch_count  # Weight decay by batches.
-                            embedding_mean = (1 - weight
+                            global_embedding_mean = (
-                                              ) * embedding_mean + weight * mean
+                                1 - weight
-                            embedding_std = (1 - weight
+                            ) * global_embedding_mean + weight * current_mean
-                                             ) * embedding_std + weight * std
+                            global_embedding_std = (
+                                1 - weight
+                            ) * global_embedding_std + weight * current_std
                        # Apply global embedding normalization.
-                        embeddings = (
+                        embeddings = (embeddings - global_embedding_mean
-                            embeddings - embedding_mean) / embedding_std
+                                      ) / global_embedding_std
                    # Update embedding dict.
                    id2embedding.update(dict(zip(ids, embeddings)))
@@ -198,7 +214,7 @@ def main(args):
                                             ])  # (N, emb_size)
    scores = cos_sim_func(enrol_embeddings, test_embeddings)
    EER, threshold = compute_eer(np.asarray(labels), scores.numpy())
-    print(
+    logger.info(
        f'EER of verification test: {EER*100:.4f}%, score threshold: {threshold:.5f}'
    )
@@ -210,10 +226,18 @@ if __name__ == "__main__":
                        choices=['cpu', 'gpu'],
                        default="gpu",
                        help="Select which device to train model, defaults to gpu.")
+    parser.add_argument("--seed",
+                        default=0,
+                        type=int,
+                        help="random seed for paddle, numpy and python random package")
+    parser.add_argument("--data-dir",
+                        default="./data/",
+                        type=str,
+                        help="data directory")
    parser.add_argument("--batch-size",
                        type=int,
                        default=16,
-                        help="Total examples' number in batch for training.")
+                        help="Total examples' number in batch for extract the embedding.")
    parser.add_argument("--num-workers",
                        type=int,
                        default=0,

--- a/examples/voxceleb/sv0/local/train.py
+++ b/examples/voxceleb/sv0/local/train.py
@@ -22,6 +22,9 @@ from paddle.io import DistributedBatchSampler
 from paddleaudio.datasets.voxceleb import VoxCeleb1
 from paddleaudio.features.core import melspectrogram
+from paddlespeech.s2t.utils.log import Log
+from paddlespeech.vector.io.augment import build_augment_pipeline
+from paddlespeech.vector.io.augment import waveform_augment
 from paddlespeech.vector.io.batch import feature_normalize
 from paddlespeech.vector.io.batch import waveform_collate_fn
 from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
@@ -29,8 +32,11 @@ from paddlespeech.vector.modules.loss import AdditiveAngularMargin
 from paddlespeech.vector.modules.loss import LogSoftmaxWrapper
 from paddlespeech.vector.modules.lr import CyclicLRScheduler
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification
+from paddlespeech.vector.training.seeding import seed_everything
 from paddlespeech.vector.utils.time import Timer
+logger = Log(__name__).getlog()
 # feat configuration
 cpu_feat_conf = {
    'n_mels': 80,
@@ -47,12 +53,19 @@ def main(args):
    paddle.distributed.init_parallel_env()
    nranks = paddle.distributed.get_world_size()
    local_rank = paddle.distributed.get_rank()
+    # set the random seed, it is a must for multiprocess training
+    seed_everything(args.seed)
-    # stage2: data prepare
+    # stage2: data prepare, such as vox1 and vox2 data, and augment data and pipeline
-    # note: some cmd must do in rank==0
+    # note: some cmd must be done in rank==0, so we will refactor the data prepare code
    train_ds = VoxCeleb1('train', target_dir=args.data_dir)
    dev_ds = VoxCeleb1('dev', target_dir=args.data_dir)
+    if args.augment:
+        augment_pipeline = build_augment_pipeline(target_dir=args.data_dir)
+    else:
+        augment_pipeline = []
    # stage3: build the dnn backbone model network
    #"channels": [1024, 1024, 1024, 1024, 3072],
    model_conf = {
@@ -83,7 +96,7 @@ def main(args):
    #         if pre-trained model exists, start epoch confirmed by the pre-trained model
    start_epoch = 0
    if args.load_checkpoint:
-        print("load the check point")
+        logger.info("load the check point")
        args.load_checkpoint = os.path.abspath(
            os.path.expanduser(args.load_checkpoint))
        try:
@@ -97,14 +110,14 @@ def main(args):
                os.path.join(args.load_checkpoint, 'model.pdopt'))
            optimizer.set_state_dict(state_dict)
            if local_rank == 0:
-                print(f'Checkpoint loaded from {args.load_checkpoint}')
+                logger.info(f'Checkpoint loaded from {args.load_checkpoint}')
        except FileExistsError:
            if local_rank == 0:
-                print('Train from scratch.')
+                logger.info('Train from scratch.')
        try:
            start_epoch = int(args.load_checkpoint[-1])
-            print(f'Restore training from epoch {start_epoch}.')
+            logger.info(f'Restore training from epoch {start_epoch}.')
        except ValueError:
            pass
@@ -137,7 +150,10 @@ def main(args):
            waveforms, labels = batch['waveforms'], batch['labels']
            # stage 9-2: audio sample augment method, which is done on the audio sample point
-            # todo
+            if len(augment_pipeline) != 0:
+                waveforms = waveform_augment(waveforms, augment_pipeline)
+                labels = paddle.concat(
+                    [labels for i in range(len(augment_pipeline) + 1)])
            # stage 9-3: extract the audio feats,such fbank, mfcc, spectrogram
            feats = []
@@ -185,7 +201,7 @@ def main(args):
                print_msg += ' acc={:.4f}'.format(avg_acc)
                print_msg += ' lr={:.4E} step/sec={:.2f} | ETA {}'.format(
                    lr, timer.timing, timer.eta)
-                print(print_msg)
+                logger.info(print_msg)
                avg_loss = 0
                num_corrects = 0
@@ -217,7 +233,7 @@ def main(args):
            num_samples = 0
            # stage 9-13: evaluation the valid dataset batch data
-            print('Evaluate on validation dataset')
+            logger.info('Evaluate on validation dataset')
            with paddle.no_grad():
                for batch_idx, batch in enumerate(dev_loader):
                    waveforms, labels = batch['waveforms'], batch['labels']
@@ -238,12 +254,12 @@ def main(args):
            print_msg = '[Evaluation result]'
            print_msg += ' dev_acc={:.4f}'.format(num_corrects / num_samples)
-            print(print_msg)
+            logger.info(print_msg)
            # stage 9-14: Save model parameters
            save_dir = os.path.join(args.checkpoint_dir,
                                    'epoch_{}'.format(epoch))
-            print('Saving model checkpoint to {}'.format(save_dir))
+            logger.info('Saving model checkpoint to {}'.format(save_dir))
            paddle.save(model.state_dict(),
                        os.path.join(save_dir, 'model.pdparams'))
            paddle.save(optimizer.state_dict(),
@@ -260,6 +276,10 @@ if __name__ == "__main__":
                        choices=['cpu', 'gpu'],
                        default="cpu",
                        help="Select which device to train model, defaults to gpu.")
+    parser.add_argument("--seed",
+                        default=0,
+                        type=int,
+                        help="random seed for paddle, numpy and python random package")
    parser.add_argument("--data-dir",
                        default="./data/",
                        type=str,
@@ -295,6 +315,10 @@ if __name__ == "__main__":
                        type=str,
                        default='./checkpoint',
                        help="Directory to save model checkpoints.")
+    parser.add_argument("--augment",
+                        action="store_true",
+                        default=False,
+                        help="Apply audio augments.")
    args = parser.parse_args()
    # yapf: enable

--- a/paddleaudio/datasets/rirs_noises.py
+++ b/paddleaudio/datasets/rirs_noises.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import csv
+import glob
+import os
+import random
+from typing import Dict
+from typing import List
+from typing import Tuple
+from paddle.io import Dataset
+from tqdm import tqdm
+from paddleaudio.backends import load as load_audio
+from paddleaudio.backends import save_wav
+from paddleaudio.datasets.dataset import feat_funcs
+from paddleaudio.utils import DATA_HOME
+from paddleaudio.utils import decompress
+from paddlespeech.s2t.utils.log import Log
+from paddlespeech.vector.utils.download import download_and_decompress
+logger = Log(__name__).getlog()
+__all__ = ['OpenRIRNoise']
+class OpenRIRNoise(Dataset):
+    archieves = [
+        {
+            'url': 'http://www.openslr.org/resources/28/rirs_noises.zip',
+            'md5': 'e6f48e257286e05de56413b4779d8ffb',
+        },
+    ]
+    sample_rate = 16000
+    meta_info = collections.namedtuple('META_INFO', ('id', 'duration', 'wav'))
+    base_path = os.path.join(DATA_HOME, 'open_rir_noise')
+    wav_path = os.path.join(base_path, 'RIRS_NOISES')
+    csv_path = os.path.join(base_path, 'csv')
+    subsets = ['rir', 'noise']
+    def __init__(self,
+                 subset: str='rir',
+                 feat_type: str='raw',
+                 target_dir=None,
+                 random_chunk: bool=True,
+                 chunk_duration: float=3.0,
+                 seed: int=0,
+                 **kwargs):
+        assert subset in self.subsets, \
+            'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset)
+        self.subset = subset
+        self.feat_type = feat_type
+        self.feat_config = kwargs
+        self.random_chunk = random_chunk
+        self.chunk_duration = chunk_duration
+        self.csv_path = os.path.join(target_dir, "open_rir_noise",
+                                     "csv") if target_dir else self.csv_path
+        self._data = self._get_data()
+        super(OpenRIRNoise, self).__init__()
+        # Set up a seed to reproduce training or predicting results (the seeding call below is currently disabled).
+        # random.seed(seed)
+    def _get_data(self):
+        # Download audio files.
+        logger.info(f"rirs noises base path: {self.base_path}")
+        if not os.path.isdir(self.base_path):
+            download_and_decompress(
+                self.archieves, self.base_path, decompress=True)
+        else:
+            logger.info(
+                f"{self.base_path} already exists, we will not download and decompress again"
+            )
+        # Data preparation.
+        logger.info(f"prepare the csv to {self.csv_path}")
+        if not os.path.isdir(self.csv_path):
+            os.makedirs(self.csv_path)
+            self.prepare_data()
+        data = []
+        with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf:
+            for line in rf.readlines()[1:]:
+                audio_id, duration, wav = line.strip().split(',')
+                data.append(self.meta_info(audio_id, float(duration), wav))
+        random.shuffle(data)
+        return data
+    def _convert_to_record(self, idx: int):
+        sample = self._data[idx]
+        record = {}
+        # To show all fields in a namedtuple: `type(sample)._fields`
+        for field in type(sample)._fields:
+            record[field] = getattr(sample, field)
+        waveform, sr = load_audio(record['wav'])
+        assert self.feat_type in feat_funcs.keys(), \
+            f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
+        feat_func = feat_funcs[self.feat_type]
+        feat = feat_func(
+            waveform, sr=sr, **self.feat_config) if feat_func else waveform
+        record.update({'feat': feat})
+        return record
+    @staticmethod
+    def _get_chunks(seg_dur, audio_id, audio_duration):
+        num_chunks = int(audio_duration / seg_dur)  # both durations are in seconds
+        chunk_lst = [
+            audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur)
+            for i in range(num_chunks)
+        ]
+        return chunk_lst
+    def _get_audio_info(self, wav_file: str,
+                        split_chunks: bool) -> List[List[str]]:
+        waveform, sr = load_audio(wav_file)
+        audio_id = wav_file.split("/open_rir_noise/")[-1].split(".")[0]
+        audio_duration = waveform.shape[0] / sr
+        ret = []
+        if split_chunks and audio_duration > self.chunk_duration:  # Split into pieces of self.chunk_duration seconds.
+            uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id,
+                                                audio_duration)
+            for idx, chunk in enumerate(uniq_chunks_list):
+                s, e = chunk.split("_")[-2:]  # Timestamps of start and end
+                start_sample = int(float(s) * sr)
+                end_sample = int(float(e) * sr)
+                new_wav_file = os.path.join(self.base_path,
+                                            audio_id + f'_chunk_{idx+1:02}.wav')
+                save_wav(waveform[start_sample:end_sample], sr, new_wav_file)
+                # id, duration, new_wav
+                ret.append([chunk, self.chunk_duration, new_wav_file])
+        else:  # Keep whole audio.
+            ret.append([audio_id, audio_duration, wav_file])
+        return ret
+    def generate_csv(self,
+                     wav_files: List[str],
+                     output_file: str,
+                     split_chunks: bool=True):
+        logger.info(f'Generating csv: {output_file}')
+        header = ["id", "duration", "wav"]
+        infos = list(
+            tqdm(
+                map(self._get_audio_info, wav_files, [split_chunks] * len(
+                    wav_files)),
+                total=len(wav_files)))
+        csv_lines = []
+        for info in infos:
+            csv_lines.extend(info)
+        with open(output_file, mode="w") as csv_f:
+            csv_writer = csv.writer(
+                csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
+            csv_writer.writerow(header)
+            for line in csv_lines:
+                csv_writer.writerow(line)
+    def prepare_data(self):
+        rir_list = os.path.join(self.wav_path, "real_rirs_isotropic_noises",
+                                "rir_list")
+        rir_files = []
+        with open(rir_list, 'r') as f:
+            for line in f.readlines():
+                rir_file = line.strip().split(' ')[-1]
+                rir_files.append(os.path.join(self.base_path, rir_file))
+        noise_list = os.path.join(self.wav_path, "pointsource_noises",
+                                  "noise_list")
+        noise_files = []
+        with open(noise_list, 'r') as f:
+            for line in f.readlines():
+                noise_file = line.strip().split(' ')[-1]
+                noise_files.append(os.path.join(self.base_path, noise_file))
+        self.generate_csv(rir_files, os.path.join(self.csv_path, 'rir.csv'))
+        self.generate_csv(noise_files, os.path.join(self.csv_path, 'noise.csv'))
+    def __getitem__(self, idx):
+        return self._convert_to_record(idx)
+    def __len__(self):
+        return len(self._data)
--- a/paddleaudio/datasets/voxceleb.py
+++ b/paddleaudio/datasets/voxceleb.py
@@ -29,9 +29,12 @@ from paddleaudio.datasets.dataset import feat_funcs
 from paddleaudio.utils import DATA_HOME
 from paddleaudio.utils import decompress
 from paddleaudio.utils import download_and_decompress
+from paddlespeech.s2t.utils.log import Log
 from utils.utility import download
 from utils.utility import unpack
+logger = Log(__name__).getlog()
 __all__ = ['VoxCeleb1']
@@ -121,9 +124,9 @@ class VoxCeleb1(Dataset):
        # Download audio files.
        # We need the users to decompress all vox1/dev/wav and vox1/test/wav/ to vox1/wav/ dir
        # so, we check the vox1/wav dir status
-        print("wav base path: {}".format(self.wav_path))
+        logger.info(f"wav base path: {self.wav_path}")
        if not os.path.isdir(self.wav_path):
-            print("start to download the voxceleb1 dataset")
+            logger.info(f"start to download the voxceleb1 dataset")
            download_and_decompress(  # multi-zip parts concatenate to vox1_dev_wav.zip
                self.archieves_audio_dev,
                self.base_path,
@@ -135,7 +138,7 @@ class VoxCeleb1(Dataset):
            # Download all parts and concatenate the files into one zip file.
            dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip')
-            print(f'Concatenating all parts to: {dev_zipfile}')
+            logger.info(f'Concatenating all parts to: {dev_zipfile}')
            os.system(
                f'cat {os.path.join(self.base_path, "vox1_dev_wav_parta*")} > {dev_zipfile}'
            )
@@ -154,6 +157,9 @@ class VoxCeleb1(Dataset):
            self.prepare_data()
        data = []
+        logger.info(
+            f"read the {self.subset} from {os.path.join(self.csv_path, f'{self.subset}.csv')}"
+        )
        with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf:
            for line in rf.readlines()[1:]:
                audio_id, duration, wav, start, stop, spk_id = line.strip(
@@ -246,7 +252,7 @@ class VoxCeleb1(Dataset):
                     wav_files: List[str],
                     output_file: str,
                     split_chunks: bool=True):
-        print(f'Generating csv: {output_file}')
+        logger.info(f'Generating csv: {output_file}')
        header = ["id", "duration", "wav", "start", "stop", "spk_id"]
        with Pool(64) as p:
@@ -269,7 +275,7 @@ class VoxCeleb1(Dataset):
    def prepare_data(self):
        # Audio of speakers in veri_test_file should not be included in training set.
-        print("start to prepare the data csv file")
+        logger.info("start to prepare the data csv file")
        enrol_files = set()
        test_files = set()
        # get the enroll and test audio file path
@@ -299,7 +305,7 @@ class VoxCeleb1(Dataset):
                speakers.add(spk)
                audio_files.append(file)
-        print("start to generate the {}".format(
+        logger.info("start to generate the {}".format(
            os.path.join(self.meta_path, 'spk_id2label.txt')))
        # encode the train and dev speakers label to spk_id2label.txt
        with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'w') as f:

--- a/paddlespeech/vector/io/augment.py
+++ b/paddlespeech/vector/io/augment.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import os
+from typing import List
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddleaudio.backends import load as load_audio
+from paddleaudio.datasets.rirs_noises import OpenRIRNoise
+from paddlespeech.s2t.utils.log import Log
+from paddlespeech.vector.io.signal_processing import compute_amplitude
+from paddlespeech.vector.io.signal_processing import convolve1d
+from paddlespeech.vector.io.signal_processing import dB_to_amplitude
+from paddlespeech.vector.io.signal_processing import notch_filter
+from paddlespeech.vector.io.signal_processing import reverberate
+logger = Log(__name__).getlog()
+# TODO: Complete type-hint and doc string.
+class DropFreq(nn.Layer):
+    def __init__(
+            self,
+            drop_freq_low=1e-14,
+            drop_freq_high=1,
+            drop_count_low=1,
+            drop_count_high=2,
+            drop_width=0.05,
+            drop_prob=1, ):
+        super(DropFreq, self).__init__()
+        self.drop_freq_low = drop_freq_low
+        self.drop_freq_high = drop_freq_high
+        self.drop_count_low = drop_count_low
+        self.drop_count_high = drop_count_high
+        self.drop_width = drop_width
+        self.drop_prob = drop_prob
+    def forward(self, waveforms):
+        # Don't drop (return early) 1-`drop_prob` portion of the batches
+        dropped_waveform = waveforms.clone()
+        if paddle.rand([1]) > self.drop_prob:
+            return dropped_waveform
+        # Add channels dimension
+        if len(waveforms.shape) == 2:
+            dropped_waveform = dropped_waveform.unsqueeze(-1)
+        # Pick number of frequencies to drop
+        drop_count = paddle.randint(
+            low=self.drop_count_low, high=self.drop_count_high + 1, shape=[1])
+        # Pick `drop_count` frequencies to drop
+        drop_range = self.drop_freq_high - self.drop_freq_low
+        drop_frequency = (
+            paddle.rand([drop_count]) * drop_range + self.drop_freq_low)
+        # Filter parameters
+        filter_length = 101
+        pad = filter_length // 2
+        # Start with delta function
+        drop_filter = paddle.zeros([1, filter_length, 1])
+        drop_filter[0, pad, 0] = 1
+        # Subtract each frequency
+        for frequency in drop_frequency:
+            notch_kernel = notch_filter(frequency, filter_length,
+                                        self.drop_width)
+            drop_filter = convolve1d(drop_filter, notch_kernel, pad)
+        # Apply filter
+        dropped_waveform = convolve1d(dropped_waveform, drop_filter, pad)
+        # Remove channels dimension if added
+        return dropped_waveform.squeeze(-1)
+class DropChunk(nn.Layer):
+    def __init__(
+            self,
+            drop_length_low=100,
+            drop_length_high=1000,
+            drop_count_low=1,
+            drop_count_high=10,
+            drop_start=0,
+            drop_end=None,
+            drop_prob=1,
+            noise_factor=0.0, ):
+        super(DropChunk, self).__init__()
+        self.drop_length_low = drop_length_low
+        self.drop_length_high = drop_length_high
+        self.drop_count_low = drop_count_low
+        self.drop_count_high = drop_count_high
+        self.drop_start = drop_start
+        self.drop_end = drop_end
+        self.drop_prob = drop_prob
+        self.noise_factor = noise_factor
+        # Validate low <= high
+        if drop_length_low > drop_length_high:
+            raise ValueError("Low limit must not be more than high limit")
+        if drop_count_low > drop_count_high:
+            raise ValueError("Low limit must not be more than high limit")
+        # Make sure the length doesn't exceed end - start
+        if drop_end is not None and drop_end >= 0:
+            if drop_start > drop_end:
+                raise ValueError("Low limit must not be more than high limit")
+            drop_range = drop_end - drop_start
+            self.drop_length_low = min(drop_length_low, drop_range)
+            self.drop_length_high = min(drop_length_high, drop_range)
+    def forward(self, waveforms, lengths):
+        # Reading input list
+        lengths = (lengths * waveforms.shape[1]).astype('int64')
+        batch_size = waveforms.shape[0]
+        dropped_waveform = waveforms.clone()
+        # Don't drop (return early) 1-`drop_prob` portion of the batches
+        if paddle.rand([1]) > self.drop_prob:
+            return dropped_waveform
+        # Store original amplitude for computing white noise amplitude
+        clean_amplitude = compute_amplitude(waveforms, lengths.unsqueeze(1))
+        # Pick a number of times to drop
+        drop_times = paddle.randint(
+            low=self.drop_count_low,
+            high=self.drop_count_high + 1,
+            shape=[batch_size], )
+        # Iterate batch to set mask
+        for i in range(batch_size):
+            if drop_times[i] == 0:
+                continue
+            # Pick lengths
+            length = paddle.randint(
+                low=self.drop_length_low,
+                high=self.drop_length_high + 1,
+                shape=[drop_times[i]], )
+            # Compute range of starting locations
+            start_min = self.drop_start
+            if start_min < 0:
+                start_min += lengths[i]
+            start_max = self.drop_end
+            if start_max is None:
+                start_max = lengths[i]
+            if start_max < 0:
+                start_max += lengths[i]
+            start_max = max(0, start_max - length.max())
+            # Pick starting locations
+            start = paddle.randint(
+                low=start_min,
+                high=start_max + 1,
+                shape=[drop_times[i]], )
+            end = start + length
+            # Update waveform
+            if not self.noise_factor:
+                for j in range(drop_times[i]):
+                    dropped_waveform[i, start[j]:end[j]] = 0.0
+            else:
+                # Uniform distribution of -2 to +2 * avg amplitude should
+                # preserve the average for normalization
+                noise_max = 2 * clean_amplitude[i] * self.noise_factor
+                for j in range(drop_times[i]):
+                    # zero-center the noise distribution
+                    noise_vec = paddle.rand([length[j]], dtype='float32')
+                    noise_vec = 2 * noise_max * noise_vec - noise_max
+                    dropped_waveform[i, int(start[j]):int(end[j])] = noise_vec
+        return dropped_waveform
+class Resample(nn.Layer):
+    """Resample waveforms from `orig_freq` to `new_freq`.
+    A bank of windowed-sinc low-pass filters (one per output phase) is
+    applied with strided convolutions. The filter bank is built lazily, on
+    the first call that actually has to resample.
+    Args:
+        orig_freq (int): Sampling rate of the input waveforms.
+        new_freq (int): Sampling rate of the returned waveforms.
+        lowpass_filter_width (int): Numerator of the filter window width
+            (`lowpass_filter_width / (2 * lowpass_cutoff)`), so larger
+            values give a wider window.
+    """
+    def __init__(
+            self,
+            orig_freq=16000,
+            new_freq=16000,
+            lowpass_filter_width=6, ):
+        super(Resample, self).__init__()
+        self.orig_freq = orig_freq
+        self.new_freq = new_freq
+        self.lowpass_filter_width = lowpass_filter_width
+        # Compute rate for striding
+        self._compute_strides()
+        assert self.orig_freq % self.conv_stride == 0
+        assert self.new_freq % self.conv_transpose_stride == 0
+    def _compute_strides(self):
+        """Derive the convolution strides from the in/out frequency ratio.
+        Sets `self.output_samples`, `self.conv_stride` and
+        `self.conv_transpose_stride`.
+        """
+        # Compute new unit based on ratio of in/out frequencies
+        base_freq = math.gcd(self.orig_freq, self.new_freq)
+        input_samples_in_unit = self.orig_freq // base_freq
+        self.output_samples = self.new_freq // base_freq
+        # Store the appropriate stride based on the new units
+        self.conv_stride = input_samples_in_unit
+        self.conv_transpose_stride = self.output_samples
+    def forward(self, waveforms):
+        """Resample a batch of waveforms.
+        Args:
+            waveforms (paddle.Tensor): Rank-2 or rank-3 input. A rank-3
+                input has axes 1 and 2 swapped before and after resampling
+                (presumably `[batch, time, channels]` - TODO confirm).
+        Returns:
+            paddle.Tensor: The input itself when both rates are equal,
+                otherwise the resampled waveforms with the input's rank.
+        Raises:
+            ValueError: If resampling is needed and the input is not 2- or
+                3-dimensional.
+        """
+        # Don't do anything if the frequencies are the same. Checking this
+        # first means identity resamplers never build the filter bank.
+        if self.orig_freq == self.new_freq:
+            return waveforms
+        if not hasattr(self, "first_indices"):
+            self._indices_and_weights(waveforms)
+        unsqueezed = False
+        if len(waveforms.shape) == 2:
+            waveforms = waveforms.unsqueeze(1)
+            unsqueezed = True
+        elif len(waveforms.shape) == 3:
+            waveforms = waveforms.transpose([0, 2, 1])
+        else:
+            raise ValueError("Input must be 2 or 3 dimensions")
+        # Do resampling
+        resampled_waveform = self._perform_resample(waveforms)
+        if unsqueezed:
+            resampled_waveform = resampled_waveform.squeeze(1)
+        else:
+            resampled_waveform = resampled_waveform.transpose([0, 2, 1])
+        return resampled_waveform
+    def _perform_resample(self, waveforms):
+        """Apply the polyphase filter bank to `waveforms`.
+        Args:
+            waveforms (paddle.Tensor): Input unpacked as
+                (batch_size, num_channels, wave_len).
+        Returns:
+            paddle.Tensor: Tensor of shape
+                (batch_size, num_channels, self._output_samples(wave_len)).
+        """
+        # Compute output size and initialize
+        batch_size, num_channels, wave_len = waveforms.shape
+        window_size = self.weights.shape[1]
+        tot_output_samp = self._output_samples(wave_len)
+        resampled_waveform = paddle.zeros((batch_size, num_channels,
+                                           tot_output_samp))
+        # eye size: (num_channels, num_channels, 1)
+        eye = paddle.eye(num_channels).unsqueeze(2)
+        # Iterate over the phases in the polyphase filter
+        for i in range(self.first_indices.shape[0]):
+            wave_to_conv = waveforms
+            first_index = int(self.first_indices[i].item())
+            if first_index >= 0:
+                # trim the signal as the filter will not be applied
+                # before the first_index
+                wave_to_conv = wave_to_conv[:, :, first_index:]
+            # pad the right of the signal to allow partial convolutions
+            # meaning compute values for partial windows (e.g. end of the
+            # window is outside the signal length)
+            max_index = (tot_output_samp - 1) // self.output_samples
+            end_index = max_index * self.conv_stride + window_size
+            current_wave_len = wave_len - first_index
+            right_padding = max(0, end_index + 1 - current_wave_len)
+            left_padding = max(0, -first_index)
+            wave_to_conv = paddle.nn.functional.pad(
+                wave_to_conv, [left_padding, right_padding], data_format='NCL')
+            conv_wave = paddle.nn.functional.conv1d(
+                x=wave_to_conv,
+                weight=self.weights[i].expand((num_channels, 1, -1)),
+                stride=self.conv_stride,
+                groups=num_channels, )
+            # we want conv_wave[:, i] to be at
+            # output[:, i + n*conv_transpose_stride]
+            dilated_conv_wave = paddle.nn.functional.conv1d_transpose(
+                conv_wave, eye, stride=self.conv_transpose_stride)
+            # pad dilated_conv_wave so it reaches the output length if needed.
+            left_padding = i
+            previous_padding = left_padding + dilated_conv_wave.shape[-1]
+            right_padding = max(0, tot_output_samp - previous_padding)
+            dilated_conv_wave = paddle.nn.functional.pad(
+                dilated_conv_wave, [left_padding, right_padding],
+                data_format='NCL')
+            dilated_conv_wave = dilated_conv_wave[:, :, :tot_output_samp]
+            resampled_waveform += dilated_conv_wave
+        return resampled_waveform
+    def _output_samples(self, input_num_samp):
+        """Return the number of output samples for `input_num_samp` inputs.
+        Args:
+            input_num_samp (int): Number of samples in the input waveform.
+        Returns:
+            int: Number of output samples; 0 when `input_num_samp` is not
+                positive.
+        """
+        samp_in = int(self.orig_freq)
+        samp_out = int(self.new_freq)
+        tick_freq = abs(samp_in * samp_out) // math.gcd(samp_in, samp_out)
+        ticks_per_input_period = tick_freq // samp_in
+        # work out the number of ticks in the time interval
+        # [ 0, input_num_samp/samp_in ).
+        interval_length = input_num_samp * ticks_per_input_period
+        if interval_length <= 0:
+            return 0
+        ticks_per_output_period = tick_freq // samp_out
+        # Get the last output-sample in the closed interval,
+        # i.e. replacing [ ) with [ ]. Note: integer division rounds down.
+        # See http://en.wikipedia.org/wiki/Interval_(mathematics) for an
+        # explanation of the notation.
+        last_output_samp = interval_length // ticks_per_output_period
+        # We need the last output-sample in the open interval, so if it
+        # takes us to the end of the interval exactly, subtract one.
+        if last_output_samp * ticks_per_output_period == interval_length:
+            last_output_samp -= 1
+        # First output-sample index is zero, so the number of output samples
+        # is the last output-sample plus one.
+        num_output_samp = last_output_samp + 1
+        return num_output_samp
+    def _indices_and_weights(self, waveforms):
+        """Build the filter bank and store it on the instance.
+        Sets `self.first_indices` (first input index used by each output
+        phase) and `self.weights` (one row of filter taps per output phase).
+        Args:
+            waveforms: Not used by this method.
+        """
+        # Lowpass filter frequency depends on smaller of two frequencies
+        min_freq = min(self.orig_freq, self.new_freq)
+        lowpass_cutoff = 0.99 * 0.5 * min_freq
+        assert lowpass_cutoff * 2 <= min_freq
+        window_width = self.lowpass_filter_width / (2.0 * lowpass_cutoff)
+        assert lowpass_cutoff < min(self.orig_freq, self.new_freq) / 2
+        output_t = paddle.arange(start=0.0, end=self.output_samples)
+        output_t /= self.new_freq
+        min_t = output_t - window_width
+        max_t = output_t + window_width
+        min_input_index = paddle.ceil(min_t * self.orig_freq)
+        max_input_index = paddle.floor(max_t * self.orig_freq)
+        num_indices = max_input_index - min_input_index + 1
+        max_weight_width = num_indices.max()
+        j = paddle.arange(max_weight_width, dtype='float32')
+        input_index = min_input_index.unsqueeze(1) + j.unsqueeze(0)
+        delta_t = (input_index / self.orig_freq) - output_t.unsqueeze(1)
+        weights = paddle.zeros_like(delta_t)
+        inside_window_indices = delta_t.abs().less_than(
+            paddle.to_tensor(window_width))
+        # raised-cosine (Hanning) window with width `window_width`
+        weights[inside_window_indices] = 0.5 * (1 + paddle.cos(
+            2 * math.pi * lowpass_cutoff / self.lowpass_filter_width *
+            delta_t.masked_select(inside_window_indices)))
+        t_eq_zero_indices = delta_t.equal(paddle.zeros_like(delta_t))
+        t_not_eq_zero_indices = delta_t.not_equal(paddle.zeros_like(delta_t))
+        # sinc filter function
+        weights = paddle.where(
+            t_not_eq_zero_indices,
+            weights * paddle.sin(2 * math.pi * lowpass_cutoff * delta_t) /
+            (math.pi * delta_t), weights)
+        # limit of the function at t = 0
+        weights = paddle.where(t_eq_zero_indices, weights * 2 * lowpass_cutoff,
+                               weights)
+        # size (output_samples, max_weight_width)
+        weights /= self.orig_freq
+        self.first_indices = min_input_index
+        self.weights = weights
+class SpeedPerturb(nn.Layer):
+    """Randomly change the speed of a waveform by resampling it.
+    One `Resample` layer is created per entry of `speeds`; each forward call
+    picks one of them at random.
+    Args:
+        orig_freq (int): Sampling rate of the input waveforms.
+        speeds (list, optional): Speeds in percent of the original speed.
+            Each value `s` resamples to `orig_freq * s // 100`.
+            Defaults to [90, 100, 110].
+        perturb_prob (float): Probability of perturbing a batch.
+    """
+    def __init__(
+            self,
+            orig_freq,
+            speeds=None,
+            perturb_prob=1.0, ):
+        super(SpeedPerturb, self).__init__()
+        self.orig_freq = orig_freq
+        # Build a fresh list per instance so no default object is shared
+        # between instances and the caller's sequence is never aliased.
+        self.speeds = [90, 100, 110] if speeds is None else list(speeds)
+        self.perturb_prob = perturb_prob
+        # Initialize index of perturbation
+        self.samp_index = 0
+        # Initialize resamplers
+        self.resamplers = []
+        for speed in self.speeds:
+            config = {
+                "orig_freq": self.orig_freq,
+                "new_freq": self.orig_freq * speed // 100,
+            }
+            self.resamplers.append(Resample(**config))
+    def forward(self, waveform):
+        """Apply one randomly chosen speed perturbation.
+        Args:
+            waveform (paddle.Tensor): Input waveforms, passed unchanged to
+                the selected `Resample` layer.
+        Returns:
+            paddle.Tensor: A clone of the input when the batch is not
+                perturbed, otherwise the output of the selected resampler.
+        """
+        # Don't perturb (return early) 1-`perturb_prob` portion of the batches
+        if paddle.rand([1]) > self.perturb_prob:
+            return waveform.clone()
+        # Perform a random perturbation
+        self.samp_index = paddle.randint(len(self.speeds), shape=[1]).item()
+        perturbed_waveform = self.resamplers[self.samp_index](waveform)
+        return perturbed_waveform
+class AddNoise(nn.Layer):
+    """Mix noise into waveforms at a random signal-to-noise ratio.
+    Args:
+        noise_dataset: Dataset whose items provide 'id' and 'feat' entries;
+            None mixes in white (normally distributed) noise instead.
+        num_workers (int): Worker count for the noise data loader.
+        snr_low (float): Lower bound of the sampled SNR, in dB.
+        snr_high (float): Upper bound of the sampled SNR, in dB.
+        mix_prob (float): Probability of adding noise to a batch.
+        start_index (int, optional): Fixed start position inside the noise
+            signal; None picks a random one for every batch.
+        normalize (bool): If True, divide each noisy waveform by its
+            absolute maximum whenever that maximum exceeds 1.
+    """
+    def __init__(
+            self,
+            noise_dataset=None,  # None for white noise
+            num_workers=0,
+            snr_low=0,
+            snr_high=0,
+            mix_prob=1.0,
+            start_index=None,
+            normalize=False, ):
+        super(AddNoise, self).__init__()
+        self.num_workers = num_workers
+        self.snr_low = snr_low
+        self.snr_high = snr_high
+        self.mix_prob = mix_prob
+        self.start_index = start_index
+        self.normalize = normalize
+        self.noise_dataset = noise_dataset
+        # Created lazily on the first call of `_load_noise`.
+        self.noise_dataloader = None
+    def forward(self, waveforms, lengths=None):
+        """Return a noisy copy of `waveforms`.
+        Args:
+            waveforms (paddle.Tensor): Waveforms indexed as (batch, time).
+            lengths (paddle.Tensor, optional): One value per waveform. It is
+                multiplied by `waveforms.shape[1]` to get sample counts, so
+                1.0 means the full width. Defaults to all ones.
+        Returns:
+            paddle.Tensor: Noisy waveforms, or an unmodified clone when the
+                batch is skipped according to `mix_prob`.
+        """
+        if lengths is None:
+            lengths = paddle.ones([len(waveforms)])
+        # Copy clean waveform to initialize noisy waveform
+        noisy_waveform = waveforms.clone()
+        lengths = (lengths * waveforms.shape[1]).astype('int64').unsqueeze(1)
+        # Don't add noise (return early) 1-`mix_prob` portion of the batches
+        if paddle.rand([1]) > self.mix_prob:
+            return noisy_waveform
+        # Compute the average amplitude of the clean waveforms
+        clean_amplitude = compute_amplitude(waveforms, lengths)
+        # Pick an SNR and use it to compute the mixture amplitude factors
+        SNR = paddle.rand((len(waveforms), 1))
+        SNR = SNR * (self.snr_high - self.snr_low) + self.snr_low
+        noise_amplitude_factor = 1 / (dB_to_amplitude(SNR) + 1)
+        new_noise_amplitude = noise_amplitude_factor * clean_amplitude
+        # Scale clean signal appropriately
+        noisy_waveform *= 1 - noise_amplitude_factor
+        # Loop through clean samples and create mixture
+        if self.noise_dataset is None:
+            white_noise = paddle.normal(shape=waveforms.shape)
+            noisy_waveform += new_noise_amplitude * white_noise
+        else:
+            tensor_length = waveforms.shape[1]
+            noise_waveform, noise_length = self._load_noise(
+                lengths,
+                tensor_length, )
+            # Rescale and add
+            noise_amplitude = compute_amplitude(noise_waveform, noise_length)
+            noise_waveform *= new_noise_amplitude / (noise_amplitude + 1e-14)
+            noisy_waveform += noise_waveform
+        # Normalizing to prevent clipping
+        if self.normalize:
+            # paddle.max returns a single Tensor of values (it is torch.max
+            # that returns a (values, indices) pair), so do not unpack it.
+            abs_max = paddle.max(
+                paddle.abs(noisy_waveform), axis=1, keepdim=True)
+            noisy_waveform = noisy_waveform / abs_max.clip(min=1.0)
+        return noisy_waveform
+    def _load_noise(self, lengths, max_length):
+        """Load a batch of noises.
+        Args:
+            lengths (paddle.Tensor): Num samples of waveforms with shape (N, 1).
+            max_length (int): Width of a batch.
+        Returns:
+            tuple: The noise batch cut to at most `max_length` samples and
+                the matching noise lengths with shape (N, 1).
+        """
+        lengths = lengths.squeeze(1)
+        batch_size = len(lengths)
+        # Load a noise batch
+        if self.noise_dataloader is None:
+            # NOTE: the loader is built only once, so `batch_size` and the
+            # `max_length` captured by the collate function are those of the
+            # first call.
+            def noise_collate_fn(batch):
+                def pad(x, target_length, mode='constant', **kwargs):
+                    x = np.asarray(x)
+                    w = target_length - x.shape[0]
+                    assert w >= 0, f'Target length {target_length} is less than origin length {x.shape[0]}'
+                    return np.pad(x, [0, w], mode=mode, **kwargs)
+                ids = [item['id'] for item in batch]
+                lengths = np.asarray([item['feat'].shape[0] for item in batch])
+                # Pad every noise to the longest one, and at least to the
+                # width of the clean batch.
+                waveforms = list(
+                    map(lambda x: pad(x, max(max_length, lengths.max().item())),
+                        [item['feat'] for item in batch]))
+                waveforms = np.stack(waveforms)
+                return {'ids': ids, 'feats': waveforms, 'lengths': lengths}
+            # Create noise data loader.
+            self.noise_dataloader = paddle.io.DataLoader(
+                self.noise_dataset,
+                batch_size=batch_size,
+                shuffle=True,
+                num_workers=self.num_workers,
+                collate_fn=noise_collate_fn,
+                return_list=True, )
+            self.noise_data = iter(self.noise_dataloader)
+        noise_batch, noise_len = self._load_noise_batch_of_size(batch_size)
+        # Select a random starting location in the waveform
+        start_index = self.start_index
+        if self.start_index is None:
+            max_chop = (noise_len - lengths).min().clip(min=1)
+            start_index = paddle.randint(high=max_chop, shape=[1])
+        # Truncate noise_batch to max_length
+        noise_batch = noise_batch[:, start_index:start_index + max_length]
+        noise_len = (noise_len - start_index).clip(max=max_length).unsqueeze(1)
+        return noise_batch, noise_len
+    def _load_noise_batch_of_size(self, batch_size):
+        """Concatenate noise batches, then chop to correct size"""
+        noise_batch, noise_lens = self._load_noise_batch()
+        # Expand
+        while len(noise_batch) < batch_size:
+            noise_batch = paddle.concat((noise_batch, noise_batch))
+            noise_lens = paddle.concat((noise_lens, noise_lens))
+        # Contract
+        if len(noise_batch) > batch_size:
+            noise_batch = noise_batch[:batch_size]
+            noise_lens = noise_lens[:batch_size]
+        return noise_batch, noise_lens
+    def _load_noise_batch(self):
+        """Load a batch of noises, restarting iteration if necessary."""
+        try:
+            batch = next(self.noise_data)
+        except StopIteration:
+            self.noise_data = iter(self.noise_dataloader)
+            batch = next(self.noise_data)
+        noises, lens = batch['feats'], batch['lengths']
+        return noises, lens
+class AddReverb(nn.Layer):
+    """Convolve waveforms with room impulse responses (RIRs).
+    Args:
+        rir_dataset: Dataset whose items provide 'id' and 'feat' entries and
+            which exposes a `sample_rate` attribute.
+        reverb_prob (float): Probability of reverberating a batch.
+        rir_scale_factor (float): Scale factor used to interpolate the RIR
+            along time; 1 leaves the RIR untouched.
+        num_workers (int): Worker count for the RIR data loader.
+    """
+    def __init__(
+            self,
+            rir_dataset,
+            reverb_prob=1.0,
+            rir_scale_factor=1.0,
+            num_workers=0, ):
+        super(AddReverb, self).__init__()
+        self.rir_dataset = rir_dataset
+        self.reverb_prob = reverb_prob
+        self.rir_scale_factor = rir_scale_factor
+        def rir_collate_fn(batch):
+            # Zero-pad every RIR on the right up to the longest one.
+            def right_pad(signal, target_length):
+                signal = np.asarray(signal)
+                missing = target_length - signal.shape[0]
+                assert missing >= 0, f'Target length {target_length} is less than origin length {signal.shape[0]}'
+                return np.pad(signal, [0, missing], mode='constant')
+            rirs = [sample['feat'] for sample in batch]
+            rir_ids = [sample['id'] for sample in batch]
+            lengths = np.asarray([rir.shape[0] for rir in rirs])
+            padded = [right_pad(rir, lengths.max().item()) for rir in rirs]
+            return {
+                'ids': rir_ids,
+                'feats': np.stack(padded),
+                'lengths': lengths
+            }
+        # Create rir data loader.
+        self.rir_dataloader = paddle.io.DataLoader(
+            self.rir_dataset,
+            collate_fn=rir_collate_fn,
+            num_workers=num_workers,
+            shuffle=True,
+            return_list=True, )
+        self.rir_data = iter(self.rir_dataloader)
+    def forward(self, waveforms, lengths=None):
+        """Reverberate a batch of waveforms.
+        Args:
+            waveforms (paddle.Tensor): Shape should be `[batch, time]` or
+                `[batch, time, channels]`.
+            lengths (paddle.Tensor, optional): Shape should be a single
+                dimension, `[batch]`. Not used by the reverberation itself.
+        Returns:
+            paddle.Tensor: Tensor of shape `[batch, time]` or
+                `[batch, time, channels]`.
+        """
+        if lengths is None:
+            lengths = paddle.ones([len(waveforms)])
+        # Leave 1-`reverb_prob` portion of the batches untouched.
+        if paddle.rand([1]) > self.reverb_prob:
+            return waveforms.clone()
+        # Work on `[batch, time, channels]`; remember whether we added it.
+        channel_added = len(waveforms.shape) == 2
+        if channel_added:
+            waveforms = waveforms.unsqueeze(-1)
+        rir_waveform = self._load_rir()
+        if self.rir_scale_factor != 1:
+            # Compress or dilate the RIR: (N, L, C) -> (N, C, L) and back.
+            rir_channels_first = rir_waveform.transpose([0, 2, 1])
+            rir_scaled = F.interpolate(
+                rir_channels_first,
+                scale_factor=self.rir_scale_factor,
+                mode="linear",
+                align_corners=False,
+                data_format='NCW', )
+            rir_waveform = rir_scaled.transpose([0, 2, 1])
+        rev_waveform = reverberate(
+            waveforms,
+            rir_waveform,
+            self.rir_dataset.sample_rate,
+            rescale_amp="avg")
+        # Drop the channel axis again if it was added above.
+        return rev_waveform.squeeze(-1) if channel_added else rev_waveform
+    def _load_rir(self):
+        """Fetch the next RIR batch, restarting the loader when exhausted.
+        Returns:
+            The 'feats' entry of the batch, with a trailing channel axis
+            appended when it is 2-dimensional.
+        """
+        try:
+            rir_batch = next(self.rir_data)
+        except StopIteration:
+            self.rir_data = iter(self.rir_dataloader)
+            rir_batch = next(self.rir_data)
+        rir = rir_batch['feats']
+        # Make sure RIR has a channel axis
+        if len(rir.shape) != 2:
+            return rir
+        return rir.unsqueeze(-1)
+class AddBabble(nn.Layer):
+    """Simulate babble noise by mixing a batch's own waveforms together.
+    Args:
+        speaker_count (int): Number of rolled copies of the batch summed to
+            form the babble signal.
+        snr_low (float): Lower bound of the sampled SNR, in dB.
+        snr_high (float): Upper bound of the sampled SNR, in dB.
+        mix_prob (float): Probability of adding babble to a batch.
+    """
+    def __init__(
+            self,
+            speaker_count=3,
+            snr_low=0,
+            snr_high=0,
+            mix_prob=1, ):
+        super(AddBabble, self).__init__()
+        self.speaker_count = speaker_count
+        self.snr_low = snr_low
+        self.snr_high = snr_high
+        self.mix_prob = mix_prob
+    def forward(self, waveforms, lengths=None):
+        """Return a copy of `waveforms` with babble noise mixed in.
+        Args:
+            waveforms (paddle.Tensor): Waveforms indexed as (batch, time).
+            lengths (paddle.Tensor, optional): One value per waveform. It is
+                multiplied by `waveforms.shape[1]`, so 1.0 means the full
+                width. Defaults to all ones.
+        Returns:
+            paddle.Tensor: The mixture, or an unmodified clone when the
+                batch is skipped according to `mix_prob`.
+        """
+        batch_size = len(waveforms)
+        if lengths is None:
+            lengths = paddle.ones([batch_size])
+        mixture = waveforms.clone()
+        lengths = (lengths * waveforms.shape[1]).unsqueeze(1)
+        # Leave 1-`mix_prob` portion of the batches untouched.
+        if paddle.rand([1]) > self.mix_prob:
+            return mixture
+        # Draw one SNR per waveform and derive the mixture amplitude factors.
+        clean_amplitude = compute_amplitude(waveforms, lengths)
+        snr_span = self.snr_high - self.snr_low
+        snr = paddle.rand((batch_size, 1)) * snr_span + self.snr_low
+        noise_amplitude_factor = 1 / (dB_to_amplitude(snr) + 1)
+        target_babble_amplitude = noise_amplitude_factor * clean_amplitude
+        # Attenuate the clean signal to make room for the babble.
+        mixture *= 1 - noise_amplitude_factor
+        # Sum the batch rolled by 1..speaker_count positions; the babble
+        # length tracks the maximum over the lengths rolled so far.
+        babble_waveform = waveforms.roll((1, ), axis=0)
+        babble_len = lengths.roll((1, ), axis=0)
+        for shift in range(2, self.speaker_count + 1):
+            babble_waveform += waveforms.roll((shift, ), axis=0)
+            candidates = paddle.concat(
+                [babble_len, babble_len.roll((1, ), axis=0)], axis=-1)
+            babble_len = candidates.max(axis=-1, keepdim=True)
+        # Bring the babble to the target amplitude and add it.
+        babble_amplitude = compute_amplitude(babble_waveform, babble_len)
+        babble_waveform *= target_babble_amplitude / (babble_amplitude + 1e-14)
+        mixture += babble_waveform
+        return mixture
+class TimeDomainSpecAugment(nn.Layer):
+    """Chain speed perturbation, frequency dropping and chunk dropping.
+    Args:
+        perturb_prob (float): Passed to `SpeedPerturb` as `perturb_prob`.
+        drop_freq_prob (float): Passed to `DropFreq` as `drop_prob`.
+        drop_chunk_prob (float): Passed to `DropChunk` as `drop_prob`.
+        speeds (list, optional): Speeds in percent, passed to
+            `SpeedPerturb`. Defaults to [95, 100, 105].
+        sample_rate (int): Passed to `SpeedPerturb` as `orig_freq`.
+        drop_freq_count_low (int): Passed to `DropFreq` as `drop_count_low`.
+        drop_freq_count_high (int): Passed to `DropFreq` as `drop_count_high`.
+        drop_chunk_count_low (int): Passed to `DropChunk` as `drop_count_low`.
+        drop_chunk_count_high (int): Passed to `DropChunk` as `drop_count_high`.
+        drop_chunk_length_low (int): Passed to `DropChunk` as `drop_length_low`.
+        drop_chunk_length_high (int): Passed to `DropChunk` as `drop_length_high`.
+        drop_chunk_noise_factor (float): Passed to `DropChunk` as `noise_factor`.
+    """
+    def __init__(
+            self,
+            perturb_prob=1.0,
+            drop_freq_prob=1.0,
+            drop_chunk_prob=1.0,
+            speeds=None,
+            sample_rate=16000,
+            drop_freq_count_low=0,
+            drop_freq_count_high=3,
+            drop_chunk_count_low=0,
+            drop_chunk_count_high=5,
+            drop_chunk_length_low=1000,
+            drop_chunk_length_high=2000,
+            drop_chunk_noise_factor=0, ):
+        super(TimeDomainSpecAugment, self).__init__()
+        # Build a fresh list per instance instead of sharing one default
+        # list object between all instances.
+        speeds = [95, 100, 105] if speeds is None else list(speeds)
+        self.speed_perturb = SpeedPerturb(
+            perturb_prob=perturb_prob,
+            orig_freq=sample_rate,
+            speeds=speeds, )
+        self.drop_freq = DropFreq(
+            drop_prob=drop_freq_prob,
+            drop_count_low=drop_freq_count_low,
+            drop_count_high=drop_freq_count_high, )
+        self.drop_chunk = DropChunk(
+            drop_prob=drop_chunk_prob,
+            drop_count_low=drop_chunk_count_low,
+            drop_count_high=drop_chunk_count_high,
+            drop_length_low=drop_chunk_length_low,
+            drop_length_high=drop_chunk_length_high,
+            noise_factor=drop_chunk_noise_factor, )
+    def forward(self, waveforms, lengths=None):
+        """Apply the three augmentations in sequence, without gradients.
+        Args:
+            waveforms (paddle.Tensor): Input waveforms.
+            lengths (paddle.Tensor, optional): One value per waveform, only
+                forwarded to `DropChunk`. Defaults to all ones.
+        Returns:
+            paddle.Tensor: The augmented waveforms.
+        """
+        if lengths is None:
+            lengths = paddle.ones([len(waveforms)])
+        with paddle.no_grad():
+            # Augmentation
+            waveforms = self.speed_perturb(waveforms)
+            waveforms = self.drop_freq(waveforms)
+            waveforms = self.drop_chunk(waveforms, lengths)
+        return waveforms
+class EnvCorrupt(nn.Layer):
+    """Environmental corruption: optional reverb, babble and noise.
+    A corrupter is only created when its data source is given (or, for
+    babble, when `babble_speaker_count` is positive) and its probability is
+    greater than zero.
+    Args:
+        reverb_prob (float): Probability of adding reverberation.
+        babble_prob (float): Probability of adding babble noise.
+        noise_prob (float): Probability of adding noise.
+        rir_dataset: Dataset of room impulse responses for `AddReverb`.
+        noise_dataset: Dataset of noises for `AddNoise`.
+        num_workers (int): Worker count for the RIR and noise data loaders.
+        babble_speaker_count (int): Passed to `AddBabble` as `speaker_count`.
+        babble_snr_low (float): Passed to `AddBabble` as `snr_low`.
+        babble_snr_high (float): Passed to `AddBabble` as `snr_high`.
+        noise_snr_low (float): Passed to `AddNoise` as `snr_low`.
+        noise_snr_high (float): Passed to `AddNoise` as `snr_high`.
+        rir_scale_factor (float): Passed to `AddReverb` as `rir_scale_factor`.
+    """
+    def __init__(
+            self,
+            reverb_prob=1.0,
+            babble_prob=1.0,
+            noise_prob=1.0,
+            rir_dataset=None,
+            noise_dataset=None,
+            num_workers=0,
+            babble_speaker_count=0,
+            babble_snr_low=0,
+            babble_snr_high=0,
+            noise_snr_low=0,
+            noise_snr_high=0,
+            rir_scale_factor=1.0, ):
+        super(EnvCorrupt, self).__init__()
+        # Initialize corrupters
+        if rir_dataset is not None and reverb_prob > 0.0:
+            self.add_reverb = AddReverb(
+                rir_dataset=rir_dataset,
+                num_workers=num_workers,
+                reverb_prob=reverb_prob,
+                rir_scale_factor=rir_scale_factor, )
+        if babble_speaker_count > 0 and babble_prob > 0.0:
+            self.add_babble = AddBabble(
+                speaker_count=babble_speaker_count,
+                snr_low=babble_snr_low,
+                snr_high=babble_snr_high,
+                mix_prob=babble_prob, )
+        if noise_dataset is not None and noise_prob > 0.0:
+            self.add_noise = AddNoise(
+                noise_dataset=noise_dataset,
+                num_workers=num_workers,
+                snr_low=noise_snr_low,
+                snr_high=noise_snr_high,
+                mix_prob=noise_prob, )
+    def forward(self, waveforms, lengths=None):
+        """Apply every configured corrupter in turn, without gradients.
+        Args:
+            waveforms (paddle.Tensor): Input waveforms.
+            lengths (paddle.Tensor, optional): One value per waveform,
+                forwarded to each corrupter. Defaults to all ones.
+        Returns:
+            paddle.Tensor: The corrupted waveforms.
+        """
+        if lengths is None:
+            lengths = paddle.ones([len(waveforms)])
+        # Augmentation
+        with paddle.no_grad():
+            if hasattr(self, "add_reverb"):
+                try:
+                    waveforms = self.add_reverb(waveforms, lengths)
+                except Exception as e:
+                    # Reverberation stays best effort (the batch continues
+                    # without reverb), but the failure is reported instead
+                    # of being silently discarded.
+                    logger.warning(
+                        f"add_reverb failed, reverberation skipped: {e}")
+            if hasattr(self, "add_babble"):
+                waveforms = self.add_babble(waveforms, lengths)
+            if hasattr(self, "add_noise"):
+                waveforms = self.add_noise(waveforms, lengths)
+        return waveforms
+def build_augment_pipeline(target_dir=None) -> List[paddle.nn.Layer]:
+    """Build the waveform augment pipeline.
+    Note: this pipeline cannot be used in the paddle.DataLoader
+    Args:
+        target_dir (str, optional): Passed to `OpenRIRNoise` as `target_dir`
+            for both the noise and the RIR datasets. Defaults to None.
+    Returns:
+        List[paddle.nn.Layer]: all augment process, in the order
+            [wavedrop, speed_perturb, add_noise, add_rev, add_rev_noise].
+    """
+    logger.info("start to build the augment pipeline")
+    noise_dataset = OpenRIRNoise('noise', target_dir=target_dir)
+    # Use the same target_dir as the noise subset; it was previously dropped
+    # for the RIR subset.
+    rir_dataset = OpenRIRNoise('rir', target_dir=target_dir)
+    # speeds=[100] resamples to the original rate, so only the drop
+    # augmentations change the waveform.
+    wavedrop = TimeDomainSpecAugment(
+        sample_rate=16000,
+        speeds=[100], )
+    speed_perturb = TimeDomainSpecAugment(
+        sample_rate=16000,
+        speeds=[95, 100, 105], )
+    add_noise = EnvCorrupt(
+        noise_dataset=noise_dataset,
+        reverb_prob=0.0,
+        noise_prob=1.0,
+        noise_snr_low=0,
+        noise_snr_high=15,
+        rir_scale_factor=1.0, )
+    add_rev = EnvCorrupt(
+        rir_dataset=rir_dataset,
+        reverb_prob=1.0,
+        noise_prob=0.0,
+        rir_scale_factor=1.0, )
+    add_rev_noise = EnvCorrupt(
+        noise_dataset=noise_dataset,
+        rir_dataset=rir_dataset,
+        reverb_prob=1.0,
+        noise_prob=1.0,
+        noise_snr_low=0,
+        noise_snr_high=15,
+        rir_scale_factor=1.0, )
+    return [wavedrop, speed_perturb, add_noise, add_rev, add_rev_noise]
+def waveform_augment(waveforms: paddle.Tensor,
+                     augment_pipeline: List[paddle.nn.Layer]) -> paddle.Tensor:
+    """Run every augmentation on the batch and stack all results.
+    Each augmented batch is truncated or right-padded along axis 1 to the
+    width of the input so that all batches can be concatenated.
+    Args:
+        waveforms (paddle.Tensor): Clean waveforms indexed as (N, L).
+        augment_pipeline (List[paddle.nn.Layer]): Augmentations, each called
+            on the clean `waveforms`.
+    Returns:
+        paddle.Tensor: The clean batch followed by one augmented batch per
+            pipeline entry, concatenated along axis 0.
+    """
+    target_len = waveforms.shape[1]
+    batches = [waveforms]
+    for augment in augment_pipeline:
+        augmented = augment(waveforms)  # (N, L)
+        missing = target_len - augmented.shape[1]
+        if missing > 0:
+            # Too short: pad the end of the time axis.
+            augmented = F.pad(
+                augmented.unsqueeze(-1), [0, missing],
+                data_format='NLC').squeeze(-1)
+        else:
+            # Long enough: keep the first `target_len` samples.
+            augmented = augmented[:, :target_len]
+        batches.append(augmented)
+    return paddle.concat(batches, axis=0)
--- a/paddlespeech/vector/io/signal_processing.py
+++ b/paddlespeech/vector/io/signal_processing.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import numpy as np
+import paddle
+# TODO: Complete type-hint and doc string.
+def blackman_window(win_len, dtype=np.float32):
+    """Return a Blackman window of `win_len` points as a paddle tensor.
+    Point n is 0.42 - 0.5 * cos(2 * pi * n / win_len)
+    + 0.08 * cos(4 * pi * n / win_len).
+    Args:
+        win_len (int): Number of window points.
+        dtype: numpy dtype of the window values. Defaults to np.float32.
+    Returns:
+        paddle.Tensor: The window, created from a numpy array of `dtype`.
+    """
+    def _coefficient(angle):
+        return 0.42 - 0.5 * np.cos(2 * angle) + 0.08 * np.cos(4 * angle)
+    angles = np.pi * np.arange(win_len) / float(win_len)
+    coefficients = np.asarray(
+        [_coefficient(angle) for angle in angles], dtype=dtype)
+    return paddle.to_tensor(coefficients)
+def compute_amplitude(waveforms, lengths=None, amp_type="avg", scale="linear"):
+    """Compute the amplitude of each waveform in a batch.
+    Args:
+        waveforms (paddle.Tensor): Waveforms; a 1-dimensional input gets a
+            leading batch axis. The reduction runs over axis 1.
+        lengths: Divisor for the "avg" amplitude (the absolute sum over
+            axis 1 is divided by it). None uses the mean over axis 1.
+        amp_type (str): "avg" for the average absolute value, "peak" for
+            the maximum absolute value.
+        scale (str): "linear" returns the amplitude itself, "dB" returns
+            20 * log10(amplitude) clipped to at least -80.
+    Returns:
+        paddle.Tensor: Amplitudes, with axis 1 kept as size 1.
+    """
+    if len(waveforms.shape) == 1:
+        waveforms = waveforms.unsqueeze(0)
+    assert amp_type in ["avg", "peak"]
+    assert scale in ["linear", "dB"]
+    if amp_type == "avg" and lengths is None:
+        out = paddle.mean(paddle.abs(waveforms), axis=1, keepdim=True)
+    elif amp_type == "avg":
+        abs_total = paddle.sum(paddle.abs(waveforms), axis=1, keepdim=True)
+        out = abs_total / lengths
+    elif amp_type == "peak":
+        out = paddle.max(paddle.abs(waveforms), axis=1, keepdim=True)
+    else:
+        raise NotImplementedError
+    if scale == "linear":
+        return out
+    if scale == "dB":
+        return paddle.clip(20 * paddle.log10(out), min=-80)
+    raise NotImplementedError
+def dB_to_amplitude(SNR):
+    """Convert a value in decibels to a linear amplitude ratio.
+    Args:
+        SNR: Value in dB (a number or anything supporting `/` and `**`).
+    Returns:
+        10 raised to the power `SNR / 20`.
+    """
+    exponent = SNR / 20
+    return 10**exponent
+def convolve1d(
+        waveform,
+        kernel,
+        padding=0,
+        pad_type="constant",
+        stride=1,
+        groups=1, ):
+    """Convolve a waveform with a kernel along the time axis.
+    Args:
+        waveform (paddle.Tensor): 3-dimensional input in (N, L, C) layout.
+        kernel (paddle.Tensor): 3-dimensional kernel; axes 1 and 2 are
+            swapped before it is used as the convolution weight.
+        padding (int | list | tuple): A (left_pad, right_pad) list or tuple
+            pads the time axis explicitly with mode `pad_type`; any other
+            value is forwarded to the convolution as its `padding`.
+        pad_type (str): Padding mode used for explicit padding.
+        stride (int): Stride of the convolution.
+        groups (int): Number of groups of the convolution.
+    Returns:
+        paddle.Tensor: The convolved signal, with time as the second axis.
+    Raises:
+        ValueError: If `waveform` is not 3-dimensional.
+    """
+    if len(waveform.shape) != 3:
+        raise ValueError("Convolve1D expects a 3-dimensional tensor")
+    # Padding can be a tuple (left_pad, right_pad) or an int. Tuples are
+    # handled like lists so that `pad_type` is honoured for both.
+    explicit_padding = isinstance(padding, (list, tuple))
+    if explicit_padding:
+        waveform = paddle.nn.functional.pad(
+            x=waveform,
+            pad=list(padding),
+            mode=pad_type,
+            data_format='NLC', )
+    # Move time dimension last, which pad and fft and conv expect.
+    # (N, L, C) -> (N, C, L)
+    waveform = waveform.transpose([0, 2, 1])
+    kernel = kernel.transpose([0, 2, 1])
+    convolved = paddle.nn.functional.conv1d(
+        x=waveform,
+        weight=kernel,
+        stride=stride,
+        groups=groups,
+        # Explicit padding was already applied above.
+        padding=0 if explicit_padding else padding, )
+    # Return time dimension to the second dimension.
+    return convolved.transpose([0, 2, 1])
+def notch_filter(notch_freq, filter_width=101, notch_width=0.05):
+    """Build a notch filter kernel as the sum of two windowed-sinc filters.
+    Args:
+        notch_freq (float): Frequency to remove; must satisfy
+            0 < notch_freq <= 1 (presumably a fraction of the Nyquist
+            frequency - TODO confirm).
+        filter_width (int): Number of filter taps; must be odd.
+        notch_width (float): Offset applied on both sides of the shifted
+            notch frequency to get the two cutoff frequencies.
+    Returns:
+        paddle.Tensor: The filter, reshaped to [1, filter_width, 1].
+    """
+    # Check inputs
+    assert 0 < notch_freq <= 1
+    assert filter_width % 2 != 0
+    half_width = filter_width // 2
+    taps = paddle.arange(filter_width, dtype='float32') - half_width
+    # Avoid frequencies that are too low
+    notch_freq += notch_width
+    def sinc(x):
+        # sin(x) / x on both sides of the middle tap; the middle tap is
+        # x == 0 and is set to 1 directly to avoid a division by zero.
+        left = paddle.sin(x[:half_width]) / x[:half_width]
+        right = paddle.sin(x[half_width + 1:]) / x[half_width + 1:]
+        return paddle.concat([left, paddle.ones([1]), right])
+    # Low-pass part: Blackman-windowed sinc, normalised to sum to 1.
+    low_pass = sinc(3 * (notch_freq - notch_width) * taps)
+    low_pass *= blackman_window(filter_width)
+    low_pass /= paddle.sum(low_pass)
+    # High-pass part: negated normalised low-pass plus a unit impulse at
+    # the middle tap.
+    high_pass = sinc(3 * (notch_freq + notch_width) * taps)
+    high_pass *= blackman_window(filter_width)
+    high_pass /= -paddle.sum(high_pass)
+    high_pass[half_width] += 1
+    # Adding filters creates notch filter
+    return (low_pass + high_pass).reshape([1, -1, 1])
+def reverberate(waveforms,
+                rir_waveform,
+                sample_rate,
+                impulse_duration=0.3,
+                rescale_amp="avg"):
+    """Convolve waveforms with a room impulse response (RIR).
+    Args:
+        waveforms (paddle.Tensor): 1-, 2- or 3-dimensional waveforms; time
+            is axis 1 once missing batch/channel axes have been added.
+        rir_waveform (paddle.Tensor): 1-, 2- or 3-dimensional RIR. The start
+            index is read with `.item()`, so the argmax over time must hold
+            a single element.
+        sample_rate (int): Sampling rate used to turn `impulse_duration`
+            into a number of samples.
+        impulse_duration (float): Length of RIR kept after its largest
+            absolute value, in seconds.
+        rescale_amp (str): Amplitude type ("avg" or "peak") used to rescale
+            the result to the amplitude of the input.
+    Returns:
+        paddle.Tensor: Reverberated waveforms with the rank of the input.
+    Raises:
+        NotImplementedError: If either input has more than 3 dimensions.
+    """
+    orig_rank = len(waveforms.shape)
+    rir_rank = len(rir_waveform.shape)
+    if orig_rank > 3 or rir_rank > 3:
+        raise NotImplementedError
+    # Bring both signals to 3 dimensions, which convolve1d expects.
+    if orig_rank == 1:
+        waveforms = waveforms.unsqueeze(0).unsqueeze(-1)
+    elif orig_rank == 2:
+        waveforms = waveforms.unsqueeze(-1)
+    if rir_rank == 1:
+        rir_waveform = rir_waveform.unsqueeze(0).unsqueeze(-1)
+    elif rir_rank == 2:
+        rir_waveform = rir_waveform.unsqueeze(-1)
+    # Amplitude of the clean signal, restored after the convolution.
+    orig_amplitude = compute_amplitude(waveforms, waveforms.shape[1],
+                                       rescale_amp)
+    # Start the RIR at its direct signal so the alignment is preserved,
+    # and keep at most `impulse_duration` seconds of it.
+    direct_start = rir_waveform.abs().argmax(axis=1).item()
+    direct_end = min(direct_start + int(sample_rate * impulse_duration),
+                     rir_waveform.shape[1])
+    rir_waveform = rir_waveform[:, direct_start:direct_end, :]
+    rir_waveform = rir_waveform / paddle.norm(rir_waveform, p=2)
+    rir_waveform = paddle.flip(rir_waveform, [1])
+    # Left-pad by the kernel length minus one so the output keeps the
+    # input length.
+    waveforms = convolve1d(
+        waveform=waveforms,
+        kernel=rir_waveform,
+        padding=[rir_waveform.shape[1] - 1, 0], )
+    # Rescale to the `rescale_amp` amplitude of the clean waveform
+    waveforms = rescale(waveforms, waveforms.shape[1], orig_amplitude,
+                        rescale_amp)
+    if orig_rank == 1:
+        return waveforms.squeeze(0).squeeze(-1)
+    if orig_rank == 2:
+        return waveforms.squeeze(-1)
+    return waveforms
+def rescale(waveforms, lengths, target_lvl, amp_type="avg", scale="linear"):
+    """Normalize waveforms and scale them to a target amplitude.
+    Args:
+        waveforms (paddle.Tensor): Waveforms; a 1-dimensional input gets a
+            temporary batch axis.
+        lengths: Forwarded to `normalize`.
+        target_lvl: Target amplitude, linear or in dB depending on `scale`.
+        amp_type (str): "avg" or "peak", forwarded to `normalize`.
+        scale (str): "linear" multiplies by `target_lvl`, "dB" multiplies
+            by `dB_to_amplitude(target_lvl)`.
+    Returns:
+        paddle.Tensor: The rescaled waveforms.
+    """
+    assert amp_type in ["peak", "avg"]
+    assert scale in ["linear", "dB"]
+    batch_added = len(waveforms.shape) == 1
+    if batch_added:
+        waveforms = waveforms.unsqueeze(0)
+    waveforms = normalize(waveforms, lengths, amp_type)
+    if scale == "linear":
+        gain = target_lvl
+    elif scale == "dB":
+        gain = dB_to_amplitude(target_lvl)
+    else:
+        raise NotImplementedError("Invalid scale, choose between dB and linear")
+    out = gain * waveforms
+    return out.squeeze(0) if batch_added else out
+def normalize(waveforms, lengths=None, amp_type="avg", eps=1e-14):
+    """Divide each waveform by its amplitude.
+    Args:
+        waveforms (paddle.Tensor): Waveforms; a 1-dimensional input gets a
+            temporary batch axis.
+        lengths: Forwarded to `compute_amplitude`.
+        amp_type (str): "avg" or "peak", forwarded to `compute_amplitude`.
+        eps (float): Added to the amplitude to avoid a division by zero.
+    Returns:
+        paddle.Tensor: The normalized waveforms, with the rank of the input.
+    """
+    assert amp_type in ["avg", "peak"]
+    batch_added = False
+    if len(waveforms.shape) == 1:
+        batch_added = True
+        waveforms = waveforms.unsqueeze(0)
+    den = compute_amplitude(waveforms, lengths, amp_type) + eps
+    # Divide while the batch axis is still present, then drop it. Squeezing
+    # first would broadcast (L,) against the (1, 1) denominator and return
+    # a (1, L) tensor for a 1-dimensional input.
+    out = waveforms / den
+    if batch_added:
+        out = out.squeeze(0)
+    return out
--- a/paddlespeech/vector/models/ecapa_tdnn.py
+++ b/paddlespeech/vector/models/ecapa_tdnn.py
@@ -19,6 +19,16 @@ import paddle.nn.functional as F
 def length_to_mask(length, max_len=None, dtype=None):
+    """_summary_
+    Args:
+        length (_type_): _description_
+        max_len (_type_, optional): _description_. Defaults to None.
+        dtype (_type_, optional): _description_. Defaults to None.
+    Returns:
+        _type_: _description_
+    """
    assert len(length.shape) == 1
    if max_len is None:
@@ -47,6 +57,19 @@ class Conv1d(nn.Layer):
            groups=1,
            bias=True,
            padding_mode="reflect", ):
+        """_summary_
+        Args:
+            in_channels (_type_): _description_
+            out_channels (_type_): _description_
+            kernel_size (_type_): _description_
+            stride (int, optional): _description_. Defaults to 1.
+            padding (str, optional): _description_. Defaults to "same".
+            dilation (int, optional): _description_. Defaults to 1.
+            groups (int, optional): _description_. Defaults to 1.
+            bias (bool, optional): _description_. Defaults to True.
+            padding_mode (str, optional): _description_. Defaults to "reflect".
+        """
        super().__init__()
        self.kernel_size = kernel_size
@@ -66,6 +89,17 @@ class Conv1d(nn.Layer):
            bias_attr=bias, )
    def forward(self, x):
+        """_summary_
+        Args:
+            x (_type_): _description_
+        Raises:
+            ValueError: _description_
+        Returns:
+            _type_: _description_
+        """
        if self.padding == "same":
            x = self._manage_padding(x, self.kernel_size, self.dilation,
                                     self.stride)
@@ -75,6 +109,17 @@ class Conv1d(nn.Layer):
        return self.conv(x)
    def _manage_padding(self, x, kernel_size: int, dilation: int, stride: int):
+        """_summary_
+        Args:
+            x (_type_): _description_
+            kernel_size (int): _description_
+            dilation (int): _description_
+            stride (int): _description_
+        Returns:
+            _type_: _description_
+        """
        L_in = x.shape[-1]  # Detecting input shape
        padding = self._get_padding_elem(L_in, stride, kernel_size,
                                         dilation)  # Time padding
@@ -88,6 +133,17 @@ class Conv1d(nn.Layer):
                          stride: int,
                          kernel_size: int,
                          dilation: int):
+        """_summary_
+        Args:
+            L_in (int): _description_
+            stride (int): _description_
+            kernel_size (int): _description_
+            dilation (int): _description_
+        Returns:
+            _type_: _description_
+        """
        if stride > 1:
            n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1)
            L_out = stride * (n_steps - 1) + kernel_size * dilation
@@ -134,6 +190,15 @@ class TDNNBlock(nn.Layer):
            kernel_size,
            dilation,
            activation=nn.ReLU, ):
+        """Implementation of TDNN network
+        Args:
+            in_channels (int): input channels or input embedding dimensions
+            out_channels (int): output channels or output embedding dimensions
+            kernel_size (int): the kernel size of the TDNN network block
+            dilation (int): the dilation of the TDNN network block
+            activation (paddle class, optional): the activation layers. Defaults to nn.ReLU.
+        """
        super().__init__()
        self.conv = Conv1d(
            in_channels=in_channels,
@@ -149,6 +214,15 @@ class TDNNBlock(nn.Layer):
 class Res2NetBlock(nn.Layer):
    def __init__(self, in_channels, out_channels, scale=8, dilation=1):
+        """Implementation of Res2Net Block with dilation
+           The paper is referred to as "Res2Net: A New Multi-scale Backbone Architecture",
+           whose url is https://arxiv.org/abs/1904.01169
+        Args:
+            in_channels (int): input channels or input dimensions
+            out_channels (int): output channels or output dimensions
+            scale (int, optional): _description_. Defaults to 8.
+            dilation (int, optional): _description_. Defaults to 1.
+        """
        super().__init__()
        assert in_channels % scale == 0
        assert out_channels % scale == 0
@@ -179,6 +253,14 @@ class Res2NetBlock(nn.Layer):
 class SEBlock(nn.Layer):
    def __init__(self, in_channels, se_channels, out_channels):
+        """Implementation of SEBlock
+           The paper is referred to as "Squeeze-and-Excitation Networks",
+           whose url is https://arxiv.org/abs/1709.01507
+        Args:
+            in_channels (int): input channels or input data dimensions
+            se_channels (_type_): _description_
+            out_channels (int): output channels or output data dimensions
+        """
        super().__init__()
        self.conv1 = Conv1d(
@@ -275,6 +357,17 @@ class SERes2NetBlock(nn.Layer):
            kernel_size=1,
            dilation=1,
            activation=nn.ReLU, ):
+        """Implementation of the Squeeze-Excitation Res2Net block in the ECAPA-TDNN network model
+        Args:
+            in_channels (int): input channels or input data dimensions
+            out_channels (_type_): _description_
+            res2net_scale (int, optional): _description_. Defaults to 8.
+            se_channels (int, optional): _description_. Defaults to 128.
+            kernel_size (int, optional): _description_. Defaults to 1.
+            dilation (int, optional): _description_. Defaults to 1.
+            activation (_type_, optional): _description_. Defaults to nn.ReLU.
+        """
        super().__init__()
        self.out_channels = out_channels
        self.tdnn1 = TDNNBlock(

--- a/paddlespeech/vector/training/seeding.py
+++ b/paddlespeech/vector/training/seeding.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddlespeech.s2t.utils.log import Log
+logger = Log(__name__).getlog()
+import random
+import numpy as np
+import paddle
+def seed_everything(seed: int):
+    """Seed paddle, random and np.random to help reproducibility.
+    Args:
+        seed (int): the value handed to each of the three seeding functions.
+    """
+    # Same order as before: paddle first, then the stdlib RNG, then numpy.
+    for set_seed in (paddle.seed, random.seed, np.random.seed):
+        set_seed(seed)
+    logger.info(f"Set the seed of paddle, random, np.random to {seed}.")