From ee5a0c487f9d8fe930413ca36e8ab1d0f1e12b5b Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 16 Apr 2021 15:55:23 +0000 Subject: [PATCH] fix cmvn compute --- .notebook/u2_model.ipynb | 60 ++++++------ deepspeech/frontend/normalizer.py | 147 +++++++++++++++++++++++++----- examples/aishell/s0/local/data.sh | 1 + utils/compute_mean_std.py | 7 +- 4 files changed, 162 insertions(+), 53 deletions(-) diff --git a/.notebook/u2_model.ipynb b/.notebook/u2_model.ipynb index 7f17b921..c3ba4fd6 100644 --- a/.notebook/u2_model.ipynb +++ b/.notebook/u2_model.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "warming-contrast", + "id": "future-wesley", "metadata": {}, "outputs": [ { @@ -32,7 +32,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "genuine-marker", + "id": "eleven-istanbul", "metadata": {}, "outputs": [ { @@ -91,7 +91,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "accepting-genesis", + "id": "provincial-mexico", "metadata": {}, "outputs": [ { @@ -815,7 +815,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "baking-ozone", + "id": "choice-psychology", "metadata": {}, "outputs": [ { @@ -1528,7 +1528,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "committed-supplier", + "id": "enabling-botswana", "metadata": {}, "outputs": [ { @@ -1551,7 +1551,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "wooden-rugby", + "id": "acute-hunter", "metadata": {}, "outputs": [], "source": [ @@ -1566,7 +1566,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "streaming-queue", + "id": "impossible-mount", "metadata": {}, "outputs": [ { @@ -1662,7 +1662,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "cardiovascular-controversy", + "id": "dying-ideal", "metadata": {}, "outputs": [], "source": [ @@ -1741,7 +1741,7 @@ { "cell_type": "code", "execution_count": 9, - "id": "sorted-nursery", + "id": "pleased-isaac", "metadata": {}, "outputs": [ { @@ -1777,7 +1777,7 @@ { "cell_type": "code", "execution_count": null, - "id": "typical-destruction", + "id": "appreciated-carpet", "metadata": {}, "outputs": [], "source": [] @@ -1785,7 +1785,7 @@ { "cell_type": "code", "execution_count": 10, - "id": "junior-toner", + "id": "suitable-railway", "metadata": {}, "outputs": [ { @@ -1924,7 +1924,7 @@ { "cell_type": "code", "execution_count": 11, - "id": "dense-brake", + "id": "afraid-translation", "metadata": {}, "outputs": [], "source": [ @@ -1934,7 +1934,7 @@ { "cell_type": "code", "execution_count": null, - "id": "voluntary-arcade", + "id": "answering-slide", "metadata": {}, "outputs": [], "source": [] @@ -1942,7 +1942,7 @@ { "cell_type": "code", "execution_count": 12, - "id": "surprising-teach", + "id": "undefined-glenn", "metadata": {}, "outputs": [ { @@ -1972,7 +1972,7 @@ { "cell_type": "code", "execution_count": null, - "id": "permanent-loading", + "id": "twenty-funds", "metadata": {}, "outputs": [], "source": [] @@ -1980,7 +1980,7 @@ { "cell_type": "code", "execution_count": 13, - "id": "criminal-setup", + "id": "threatened-phase", "metadata": {}, "outputs": [ { @@ -2003,7 +2003,7 @@ { "cell_type": "code", "execution_count": 14, - "id": "brazilian-happening", + "id": "ordered-denver", "metadata": {}, "outputs": [ { @@ -2021,7 +2021,7 @@ { "cell_type": "code", "execution_count": 15, - "id": "separate-eligibility", + "id": "above-investigator", "metadata": {}, "outputs": [ { @@ -2053,7 +2053,7 @@ { "cell_type": "code", "execution_count": 33, - "id": "alternate-comment", + "id": "dimensional-introduction", "metadata": {}, "outputs": [ { @@ -2216,7 +2216,7 @@ { "cell_type": "code", "execution_count": null, - "id": "polish-opportunity", + "id": "basic-basement", "metadata": {}, "outputs": [], "source": [] @@ -2224,7 +2224,7 @@ { "cell_type": "code", "execution_count": 17, - "id": "improved-alabama", + "id": "decreased-automation", "metadata": {}, "outputs": [ { @@ -2246,7 +2246,7 @@ { "cell_type": "code", "execution_count": 18, - "id": "metric-destruction", + "id": "marine-middle", "metadata": {}, "outputs": [], "source": [ @@ -2256,7 +2256,7 @@ { "cell_type": "code", "execution_count": 20, - "id": "turkish-watch", + "id": "young-reserve", "metadata": {}, "outputs": [], "source": [ @@ -2267,7 +2267,7 @@ { "cell_type": "code", "execution_count": 47, - "id": "drawn-crash", + "id": "differential-mileage", "metadata": {}, "outputs": [], "source": [ @@ -2328,7 +2328,7 @@ { "cell_type": "code", "execution_count": 48, - "id": "informative-optics", + "id": "industrial-server", "metadata": {}, "outputs": [ { @@ -2385,7 +2385,7 @@ { "cell_type": "code", "execution_count": 49, - "id": "northern-advisory", + "id": "noticed-soviet", "metadata": {}, "outputs": [ { @@ -2416,7 +2416,7 @@ { "cell_type": "code", "execution_count": 50, - "id": "prospective-death", + "id": "clinical-matter", "metadata": {}, "outputs": [ { @@ -2455,7 +2455,7 @@ { "cell_type": "code", "execution_count": 51, - "id": "closed-partner", + "id": "checked-picking", "metadata": {}, "outputs": [], "source": [] @@ -2463,7 +2463,7 @@ { "cell_type": "code", "execution_count": 52, - "id": "silent-animal", + "id": "normal-airfare", "metadata": {}, "outputs": [ { @@ -2483,7 +2483,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fatal-board", + "id": "fewer-drill", "metadata": {}, "outputs": [], "source": [] diff --git a/deepspeech/frontend/normalizer.py b/deepspeech/frontend/normalizer.py index 53ae26b8..f681b289 100644 --- a/deepspeech/frontend/normalizer.py +++ b/deepspeech/frontend/normalizer.py @@ -15,10 +15,69 @@ import random import numpy as np +import paddle +from paddle.io import DataLoader +from paddle.io import Dataset from deepspeech.frontend.audio import AudioSegment from deepspeech.frontend.utility import load_cmvn from deepspeech.frontend.utility import read_manifest +from deepspeech.utils.log import Log + +__all__ = ["FeatureNormalizer"] + +logger = Log(__name__).getlog() + + +class CollateFunc(object): + ''' Collate function for AudioDataset + ''' + + def __init__(self): + pass + + def __call__(self, batch): + mean_stat = None + var_stat = None + number = 0 + for feat in batch: + sums = np.sum(feat, axis=1) + if mean_stat is None: + mean_stat = sums + else: + mean_stat += sums + + square_sums = np.sum(np.square(feat), axis=1) + if var_stat is None: + var_stat = square_sums + else: + var_stat += square_sums + + number += feat.shape[1] + return paddle.to_tensor(number), paddle.to_tensor( + mean_stat), paddle.to_tensor(var_stat) + #return number, mean_stat, var_stat + + +class AudioDataset(Dataset): + def __init__(self, manifest_path, feature_func, num_samples=-1, rng=None): + self.feature_func = feature_func + self._rng = rng + manifest = read_manifest(manifest_path) + if num_samples == -1: + sampled_manifest = manifest + else: + sampled_manifest = self._rng.sample(manifest, num_samples) + self.items = sampled_manifest + + def __len__(self): + return len(self.items) + + def __getitem__(self, idx): + key = self.items[idx]['feat'] + audioseg = AudioSegment.from_file(key) + feat = self.feature_func(audioseg) #(D, T) + return feat class FeatureNormalizer(object): @@ -49,13 +108,15 @@ class FeatureNormalizer(object): manifest_path=None, featurize_func=None, num_samples=500, + num_workers=0, random_seed=0): if not mean_std_filepath: if not (manifest_path and featurize_func): raise ValueError("If mean_std_filepath is None, meanifest_path " "and featurize_func should not be None.") self._rng = random.Random(random_seed) - self._compute_mean_std(manifest_path, featurize_func, num_samples) + self._compute_mean_std(manifest_path, featurize_func, num_samples, + num_workers) else: self._read_mean_std_from_file(mean_std_filepath) @@ -71,37 +132,79 @@ class FeatureNormalizer(object): """ return (features - self._mean) * self._istd + def _read_mean_std_from_file(self, filepath, eps=1e-20): + """Load mean and std from file.""" + mean, istd = load_cmvn(filepath, filetype='json') + self._mean = mean + self._istd = istd + def write_to_file(self, filepath): """Write the mean and stddev to the file. :param filepath: File to write mean and stddev. :type filepath: str """ - np.savez(filepath, mean=self._mean, istd=self._istd) - - def _read_mean_std_from_file(self, filepath, eps=1e-20): - """Load mean and std from file.""" - mean, istd = load_cmvn(filepath, filetype='npz') - self._mean = mean.T - self._istd = istd.T + with open(filepath, 'w') as fout: + fout.write(json.dumps(self.cmvn_info)) def _compute_mean_std(self, manifest_path, featurize_func, num_samples, + num_workers, eps=1e-20): """Compute mean and std from randomly sampled instances.""" - manifest = read_manifest(manifest_path) - if num_samples == -1: - sampled_manifest = manifest - else: - sampled_manifest = self._rng.sample(manifest, num_samples) - features = [] - for instance in sampled_manifest: - features.append( - featurize_func(AudioSegment.from_file(instance["feat"]))) - features = np.hstack(features) #(D, T) - self._mean = np.mean(features, axis=1) #(D,) - std = np.std(features, axis=1) #(D,) - std = np.clip(std, eps, None) - self._istd = 1.0 / std + # manifest = read_manifest(manifest_path) + # if num_samples == -1: + # sampled_manifest = manifest + # else: + # sampled_manifest = self._rng.sample(manifest, num_samples) + # features = [] + # for instance in sampled_manifest: + # features.append( + # featurize_func(AudioSegment.from_file(instance["feat"]))) + # features = np.hstack(features) #(D, T) + # self._mean = np.mean(features, axis=1) #(D,) + # std = np.std(features, axis=1) #(D,) + # std = np.clip(std, eps, None) + # self._istd = 1.0 / std + + collate_func = CollateFunc() + + dataset = AudioDataset(manifest_path, featurize_func, num_samples) + + batch_size = 20 + data_loader = DataLoader( + dataset, + batch_size=batch_size, + shuffle=False, + num_workers=num_workers, + collate_fn=collate_func) + + with paddle.no_grad(): + all_mean_stat = None + all_var_stat = None + all_number = 0 + wav_number = 0 + for batch in data_loader(): + number, mean_stat, var_stat = batch + if all_mean_stat is None: + all_mean_stat = mean_stat + all_var_stat = var_stat + else: + all_mean_stat += mean_stat + all_var_stat += var_stat + all_number += number + wav_number += batch_size + + if wav_number % 1000 == 0: + logger.info('process {} wavs,{} frames'.format( + wav_number, int(all_number))) + + self.cmvn_info = { + 'mean_stat': list(all_mean_stat.tolist()), + 'var_stat': list(all_var_stat.tolist()), + 'frame_num': int(all_number), + } + + return self.cmvn_info diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/s0/local/data.sh index f98e5a85..5af19e75 100644 --- a/examples/aishell/s0/local/data.sh +++ b/examples/aishell/s0/local/data.sh @@ -42,6 +42,7 @@ python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --stride_ms=10.0 \ --window_ms=25.0 \ --sample_rate=16000 \ +--num_workers=0 \ --output_path="data/mean_std.npz" if [ $? -ne 0 ]; then diff --git a/utils/compute_mean_std.py b/utils/compute_mean_std.py index 780568f9..682a5387 100644 --- a/utils/compute_mean_std.py +++ b/utils/compute_mean_std.py @@ -39,6 +39,10 @@ add_arg('sample_rate', int, 16000, "target sample rate.") add_arg('manifest_path', str, 'data/librispeech/manifest.train', "Filepath of manifest to compute normalizer's mean and stddev.") +add_arg('num_workers', + default=0, + type=int, + help='num of subprocess workers for processing') add_arg('output_path', str, 'data/librispeech/mean_std.npz', "Filepath of write mean and stddev to (.npz).") @@ -70,7 +74,8 @@ def main(): mean_std_filepath=None, manifest_path=args.manifest_path, featurize_func=augment_and_featurize, - num_samples=args.num_samples) + num_samples=args.num_samples, + num_workers=args.num_workers) normalizer.write_to_file(args.output_path) -- GitLab