From 8b0e344c6983a28057bfd60e32cc6ef9af91c584 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Mon, 8 Nov 2021 06:53:22 +0000
Subject: [PATCH] fix logfbank using PCM16

---
 examples/librispeech/s1/conf/preprocess.yaml |  4 --
 paddlespeech/s2t/frontend/audio.py           | 30 ++----------
 paddlespeech/s2t/frontend/utility.py         | 51 +++++++++++++++++++-
 paddlespeech/s2t/transform/spectrogram.py    | 27 +++++++++--
 4 files changed, 76 insertions(+), 36 deletions(-)

diff --git a/examples/librispeech/s1/conf/preprocess.yaml b/examples/librispeech/s1/conf/preprocess.yaml
index bcbc7ad2..97ebf41d 100644
--- a/examples/librispeech/s1/conf/preprocess.yaml
+++ b/examples/librispeech/s1/conf/preprocess.yaml
@@ -23,7 +23,3 @@ process:
     n_mask: 2
     inplace: true
     replace_with_zero: true
-
-
-
-
diff --git a/paddlespeech/s2t/frontend/audio.py b/paddlespeech/s2t/frontend/audio.py
index 13dc3a44..4171f85b 100644
--- a/paddlespeech/s2t/frontend/audio.py
+++ b/paddlespeech/s2t/frontend/audio.py
@@ -25,6 +25,8 @@ import soxbindings as sox
 from scipy import signal
 
 from .utility import subfile_from_tar
+from .utility import convert_samples_to_float32
+from .utility import convert_samples_from_float32
 
 
 class AudioSegment():
@@ -689,15 +691,7 @@ class AudioSegment():
         Audio sample type is usually integer or float-point.
         Integers will be scaled to [-1, 1] in float32.
         """
-        float32_samples = samples.astype('float32')
-        if samples.dtype in np.sctypes['int']:
-            bits = np.iinfo(samples.dtype).bits
-            float32_samples *= (1. / 2**(bits - 1))
-        elif samples.dtype in np.sctypes['float']:
-            pass
-        else:
-            raise TypeError("Unsupported sample type: %s." % samples.dtype)
-        return float32_samples
+        return convert_samples_to_float32(samples)
 
     def _convert_samples_from_float32(self, samples, dtype):
         """Convert sample type from float32 to dtype.
@@ -708,20 +702,4 @@ class AudioSegment():
 
         This is for writing a audio file.
         """
-        dtype = np.dtype(dtype)
-        output_samples = samples.copy()
-        if dtype in np.sctypes['int']:
-            bits = np.iinfo(dtype).bits
-            output_samples *= (2**(bits - 1) / 1.)
-            min_val = np.iinfo(dtype).min
-            max_val = np.iinfo(dtype).max
-            output_samples[output_samples > max_val] = max_val
-            output_samples[output_samples < min_val] = min_val
-        elif samples.dtype in np.sctypes['float']:
-            min_val = np.finfo(dtype).min
-            max_val = np.finfo(dtype).max
-            output_samples[output_samples > max_val] = max_val
-            output_samples[output_samples < min_val] = min_val
-        else:
-            raise TypeError("Unsupported sample type: %s." % samples.dtype)
-        return output_samples.astype(dtype)
+        return convert_samples_from_float32(samples, dtype)
diff --git a/paddlespeech/s2t/frontend/utility.py b/paddlespeech/s2t/frontend/utility.py
index 089890d2..58e5b1b0 100644
--- a/paddlespeech/s2t/frontend/utility.py
+++ b/paddlespeech/s2t/frontend/utility.py
@@ -30,7 +30,8 @@ logger = Log(__name__).getlog()
 __all__ = [
     "load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs",
     "max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS",
-    "EOS", "UNK", "BLANK", "MASKCTC", "SPACE"
+    "EOS", "UNK", "BLANK", "MASKCTC", "SPACE", "convert_samples_to_float32",
+    "convert_samples_from_float32"
 ]
 
 IGNORE_ID = -1
@@ -342,3 +343,51 @@ def load_cmvn(cmvn_file: str, filetype: str):
     else:
         raise ValueError(f"cmvn file type no support: {filetype}")
     return cmvn[0], cmvn[1]
+
+
+def convert_samples_to_float32(samples):
+    """Convert audio samples to float32.
+
+    Signed integer samples (e.g. int16 PCM) are scaled by 1 / 2**(bits - 1)
+    into [-1, 1]; floating point samples are only cast to float32.
+
+    Raises TypeError for any other sample dtype.
+    """
+    float32_samples = samples.astype('float32')
+    # dtype.kind is used instead of np.sctypes, which NumPy 2.0 removed.
+    kind = samples.dtype.kind
+    if kind == 'i':
+        bits = np.iinfo(samples.dtype).bits
+        float32_samples *= (1. / 2**(bits - 1))
+    elif kind != 'f':
+        raise TypeError("Unsupported sample type: %s." % samples.dtype)
+    return float32_samples
+
+
+def convert_samples_from_float32(samples, dtype):
+    """Convert float32 samples to the target sample dtype.
+
+    For a signed integer target, samples are rescaled from [-1, 1] to the
+    integer range and clipped to it. For a floating point target, samples
+    are clipped to the finite range of that type. The input array is not
+    modified; a converted copy is returned.
+
+    Raises TypeError if the target dtype is neither of those.
+    """
+    dtype = np.dtype(dtype)
+    output_samples = samples.copy()
+    # Branch on the *target* dtype. dtype.kind replaces np.sctypes, which
+    # NumPy 2.0 removed.
+    if dtype.kind == 'i':
+        info = np.iinfo(dtype)
+        output_samples *= (2**(info.bits - 1) / 1.)
+    elif dtype.kind == 'f':
+        info = np.finfo(dtype)
+    else:
+        raise TypeError("Unsupported sample type: %s." % dtype)
+    # Saturate out-of-range values before the final cast.
+    min_val, max_val = info.min, info.max
+    output_samples[output_samples > max_val] = max_val
+    output_samples[output_samples < min_val] = min_val
+    return output_samples.astype(dtype)
+
diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py
index 6956b908..9e576d0d 100644
--- a/paddlespeech/s2t/transform/spectrogram.py
+++ b/paddlespeech/s2t/transform/spectrogram.py
@@ -307,6 +307,9 @@ class IStft():
             center=self.center, )
 
 
+from paddlespeech.s2t.utils.log import Log
+logger = Log(__name__).getlog()
+
 class LogMelSpectrogramKaldi():
     def __init__(
             self,
@@ -346,7 +349,7 @@ class LogMelSpectrogramKaldi():
     def __repr__(self):
         return ("{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, "
                 "n_shift={n_shift}, win_length={win_length}, window={window}, "
-                "fmin={fmin}, fmax={fmax}, eps={eps}))".format(
+                "fmin={fmin}, fmax={fmax}, eps={eps}, preemph={preemph}, dither={dither})".format(
                     name=self.__class__.__name__,
                     fs=self.fs,
                     n_mels=self.n_mels,
@@ -356,7 +359,10 @@ class LogMelSpectrogramKaldi():
                     window=self.window,
                     fmin=self.fmin,
                     fmax=self.fmax,
-                    eps=self.eps, ))
+                    eps=self.eps,
+                    preemph=self.preemph,
+                    dither=self.dither,
+                ))
 
     def __call__(self, x):
         """
@@ -372,9 +378,15 @@ class LogMelSpectrogramKaldi():
         """
         if x.ndim != 1:
             raise ValueError("Not support x: [Time, Channel]")
-        if x.dtype == np.int16:
-            x = x / 2**(16 - 1)
+
+        # logfbank expects samples in the PCM16 value range. Floating point
+        # input is presumably normalized to [-1, 1] (confirm against
+        # callers), so it is scaled up by 2**15; other dtypes pass through.
+        if x.dtype.kind == 'f':
+            bits = np.iinfo(np.int16).bits
+            x = x * 2**(bits - 1)
+
         return logfbank(
             signal=x,
             samplerate=self.fs,
             winlen=self.win_length,  # unit ms
-- 
GitLab