fix cmvn

1635e000 · Hui Zhang · 2aed2752 · 1635e000 · 1635e000 · 1635e000
7 changed files
--- a/deepspeech/frontend/normalizer.py
+++ b/deepspeech/frontend/normalizer.py
@@ -22,9 +22,12 @@ from paddle.io import Dataset
 from deepspeech.frontend.audio import AudioSegment
 from deepspeech.frontend.utility import load_cmvn
 from deepspeech.frontend.utility import read_manifest
+from deepspeech.utils.log import Log

 __all__ = ["FeatureNormalizer"]

+logger = Log(__name__).getlog()
+

 # https://github.com/PaddlePaddle/Paddle/pull/31481
 class CollateFunc(object):
@@ -176,7 +179,7 @@ class FeatureNormalizer(object):
                wav_number += batch_size

                if wav_number % 1000 == 0:
-                    print('process {} wavs,{} frames'.format(wav_number,
+                    logger.info('process {} wavs,{} frames'.format(wav_number,
                                                                   all_number))

        self.cmvn_info = {

--- a/deepspeech/utils/log.py
+++ b/deepspeech/utils/log.py
@@ -17,6 +17,12 @@ import os
 import socket
 import sys

+FORMAT_STR = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
+DATE_FMT_STR = '%Y/%m/%d %H:%M:%S'
+
+logging.basicConfig(
+    level=logging.DEBUG, format=FORMAT_STR, datefmt=DATE_FMT_STR)
+

 def find_log_dir(log_dir=None):
    """Returns the most suitable directory to put log files into.
@@ -123,12 +129,10 @@ class Log():
            pass

        if not self.logger.hasHandlers():
-            format = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
-            formatter = logging.Formatter(
-                fmt=format, datefmt='%Y/%m/%d %H:%M:%S')
+            formatter = logging.Formatter(fmt=FORMAT_STR, datefmt=DATE_FMT_STR)
            fh = logging.FileHandler(Log.log_name)
-            fh.setFormatter(formatter)
            fh.setLevel(logging.DEBUG)
+            fh.setFormatter(formatter)
            self.logger.addHandler(fh)

            ch = logging.StreamHandler()
@@ -136,9 +140,6 @@ class Log():
            ch.setFormatter(formatter)
            self.logger.addHandler(ch)

-            #fh.close()
-            #ch.close()
-
        # stop propagate for propagating may print
        # log multiple times
        self.logger.propagate = False

--- a/examples/aishell/s0/local/data.sh
+++ b/examples/aishell/s0/local/data.sh
@@ -51,6 +51,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    --stride_ms=10.0 \
    --window_ms=25.0 \
    --sample_rate=16000 \
+    --use_dB_normalization=False \
    --num_samples=-1 \
    --num_workers=16 \
    --output_path="data/mean_std.json"

--- a/examples/librispeech/s1/local/data.sh
+++ b/examples/librispeech/s1/local/data.sh
@@ -73,6 +73,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    --sample_rate=16000 \
    --stride_ms=10.0 \
    --window_ms=25.0 \
+    --use_dB_normalization=False \
    --num_workers=${num_workers} \
    --output_path="data/mean_std.json"


--- a/examples/tiny/s0/local/data.sh
+++ b/examples/tiny/s0/local/data.sh
@@ -57,6 +57,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    --sample_rate=16000 \
    --stride_ms=10.0 \
    --window_ms=25.0 \
+    --use_dB_normalization=False \
    --num_workers=2 \
    --output_path="data/mean_std.json"
    

--- a/utils/avg_model.py
+++ b/utils/avg_model.py
@@ -21,6 +21,8 @@ import paddle


 def main(args):
+    paddle.set_device('cpu')
+
    val_scores = []
    beat_val_scores = []
    selected_epochs = []

--- a/utils/compute_mean_std.py
+++ b/utils/compute_mean_std.py
@@ -25,17 +25,19 @@ parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
 add_arg('num_samples',      int,    -1,    "# of samples to for statistics.")
+
 add_arg('specgram_type',    str,
        'linear',
        "Audio feature type. Options: linear, mfcc, fbank.",
        choices=['linear', 'mfcc', 'fbank'])
 add_arg('feat_dim',    int, 13, "Audio feature dim.")
-add_arg('delta_delta',    bool,
-        False,
-        "Audio feature with delta delta.")
+add_arg('delta_delta', bool,  False, "Audio feature with delta delta.")
 add_arg('stride_ms', float, 10.0,  "stride length in ms.")
 add_arg('window_ms', float, 20.0,  "stride length in ms.")
 add_arg('sample_rate',  int, 16000,  "target sample rate.")
+add_arg('use_dB_normalization', bool, False, "do dB normalization.")
+add_arg('target_dB',   int, -20,  "target dB.")
+
 add_arg('manifest_path',    str,
        'data/librispeech/manifest.train',
        "Filepath of manifest to compute normalizer's mean and stddev.")
@@ -63,8 +65,8 @@ def main():
        n_fft=None,
        max_freq=None,
        target_sample_rate=args.sample_rate,
-        use_dB_normalization=True,
-        target_dB=-20,
+        use_dB_normalization=args.use_dB_normalization,
+        target_dB=args.target_dB,
        dither=0.0)

    def augment_and_featurize(audio_segment):