# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Contains feature normalizers.""" import random import numpy as np import paddle from paddle.io import DataLoader from paddle.io import Dataset from deepspeech.frontend.audio import AudioSegment from deepspeech.frontend.utility import load_cmvn from deepspeech.frontend.utility import read_manifest from deepspeech.utils.log import Log __all__ = ["FeatureNormalizer"] logger = Log(__name__).getlog() class CollateFunc(object): ''' Collate function for AudioDataset ''' def __init__(self): pass def __call__(self, batch): mean_stat = None var_stat = None number = 0 for feat in batch: sums = np.sum(feat, axis=1) if mean_stat is None: mean_stat = sums else: mean_stat += sums square_sums = np.sum(np.square(feat), axis=1) if var_stat is None: var_stat = square_sums else: var_stat += square_sums number += feat.shape[1] return paddle.to_tensor(number), paddle.to_tensor( mean_stat), paddle.to_tensor(var_stat) #return number, mean_stat, var_stat class AudioDataset(Dataset): def __init__(self, manifest_path, feature_func, num_samples=-1, rng=None): self.feature_func = feature_func self._rng = rng manifest = read_manifest(manifest_path) if num_samples == -1: sampled_manifest = manifest else: sampled_manifest = self._rng.sample(manifest, num_samples) self.items = sampled_manifest def __len__(self): return len(self.items) def __getitem__(self, idx): key = self.items[idx]['feat'] audioseg = AudioSegment.from_file(key) feat = self.feature_func(audioseg) #(D, T) return feat class FeatureNormalizer(object): """Feature normalizer. Normalize features to be of zero mean and unit stddev. if mean_std_filepath is provided (not None), the normalizer will directly initilize from the file. Otherwise, both manifest_path and featurize_func should be given for on-the-fly mean and stddev computing. :param mean_std_filepath: File containing the pre-computed mean and stddev. :type mean_std_filepath: None|str :param manifest_path: Manifest of instances for computing mean and stddev. :type meanifest_path: None|str :param featurize_func: Function to extract features. It should be callable with ``featurize_func(audio_segment)``. :type featurize_func: None|callable :param num_samples: Number of random samples for computing mean and stddev. :type num_samples: int :param random_seed: Random seed for sampling instances. :type random_seed: int :raises ValueError: If both mean_std_filepath and manifest_path (or both mean_std_filepath and featurize_func) are None. """ def __init__(self, mean_std_filepath, manifest_path=None, featurize_func=None, num_samples=500, num_workers=0, random_seed=0): if not mean_std_filepath: if not (manifest_path and featurize_func): raise ValueError("If mean_std_filepath is None, meanifest_path " "and featurize_func should not be None.") self._rng = random.Random(random_seed) self._compute_mean_std(manifest_path, featurize_func, num_samples, num_workers) else: self._read_mean_std_from_file(mean_std_filepath) def apply(self, features): """Normalize features to be of zero mean and unit stddev. :param features: Input features to be normalized. :type features: ndarray, shape (D, T) :param eps: added to stddev to provide numerical stablibity. :type eps: float :return: Normalized features. :rtype: ndarray """ return (features - self._mean) * self._istd def _read_mean_std_from_file(self, filepath, eps=1e-20): """Load mean and std from file.""" mean, istd = load_cmvn(filepath, filetype='json') self._mean = mean self._istd = istd def write_to_file(self, filepath): """Write the mean and stddev to the file. :param filepath: File to write mean and stddev. :type filepath: str """ with open(filepath, 'w') as fout: fout.write(json.dumps(self.cmvn_info)) def _compute_mean_std(self, manifest_path, featurize_func, num_samples, num_workers, eps=1e-20): """Compute mean and std from randomly sampled instances.""" # manifest = read_manifest(manifest_path) # if num_samples == -1: # sampled_manifest = manifest # else: # sampled_manifest = self._rng.sample(manifest, num_samples) # features = [] # for instance in sampled_manifest: # features.append( # featurize_func(AudioSegment.from_file(instance["feat"]))) # features = np.hstack(features) #(D, T) # self._mean = np.mean(features, axis=1) #(D,) # std = np.std(features, axis=1) #(D,) # std = np.clip(std, eps, None) # self._istd = 1.0 / std collate_func = CollateFunc() dataset = AudioDataset(manifest_path, featurize_func, num_samples) batch_size = 20 data_loader = DataLoader( dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, collate_fn=collate_func) with paddle.no_grad(): all_mean_stat = None all_var_stat = None all_number = 0 wav_number = 0 for batch in data_loader(): number, mean_stat, var_stat = batch if all_mean_stat is None: all_mean_stat = mean_stat all_var_stat = var_stat else: all_mean_stat += mean_stat all_var_stat += var_stat all_number += number wav_number += batch_size if wav_number % 1000 == 0: logger.info('process {} wavs,{} frames'.format( wav_number, int(all_number))) self.cmvn_info = { 'mean_stat': list(all_mean_stat.tolist()), 'var_stat': list(all_var_stat.tolist()), 'frame_num': int(all_number), } return self.cmvn_info