fix logfbank using PCM16

8b0e344c · Hui Zhang · d62092ac · 8b0e344c · 8b0e344c · 8b0e344c
4 changed files
--- a/examples/librispeech/s1/conf/preprocess.yaml
+++ b/examples/librispeech/s1/conf/preprocess.yaml
@@ -23,7 +23,3 @@ process:
    n_mask: 2
    inplace: true
    replace_with_zero: true
-
-
-
-
--- a/paddlespeech/s2t/frontend/audio.py
+++ b/paddlespeech/s2t/frontend/audio.py
@@ -25,6 +25,8 @@ import soxbindings as sox
 from scipy import signal

 from .utility import subfile_from_tar
+from .utility import convert_samples_to_float32
+from .utility import convert_samples_from_float32


 class AudioSegment():
@@ -689,15 +691,7 @@ class AudioSegment():
        Audio sample type is usually integer or float-point.
        Integers will be scaled to [-1, 1] in float32.
        """
-        float32_samples = samples.astype('float32')
-        if samples.dtype in np.sctypes['int']:
-            bits = np.iinfo(samples.dtype).bits
-            float32_samples *= (1. / 2**(bits - 1))
-        elif samples.dtype in np.sctypes['float']:
-            pass
-        else:
-            raise TypeError("Unsupported sample type: %s." % samples.dtype)
-        return float32_samples
+        return convert_samples_to_float32(samples)

    def _convert_samples_from_float32(self, samples, dtype):
        """Convert sample type from float32 to dtype.
@@ -708,20 +702,4 @@ class AudioSegment():

        This is for writing a audio file.
        """
-        dtype = np.dtype(dtype)
-        output_samples = samples.copy()
-        if dtype in np.sctypes['int']:
-            bits = np.iinfo(dtype).bits
-            output_samples *= (2**(bits - 1) / 1.)
-            min_val = np.iinfo(dtype).min
-            max_val = np.iinfo(dtype).max
-            output_samples[output_samples > max_val] = max_val
-            output_samples[output_samples < min_val] = min_val
-        elif samples.dtype in np.sctypes['float']:
-            min_val = np.finfo(dtype).min
-            max_val = np.finfo(dtype).max
-            output_samples[output_samples > max_val] = max_val
-            output_samples[output_samples < min_val] = min_val
-        else:
-            raise TypeError("Unsupported sample type: %s." % samples.dtype)
-        return output_samples.astype(dtype)
+        return convert_samples_from_float32(samples, dtype)
--- a/paddlespeech/s2t/frontend/utility.py
+++ b/paddlespeech/s2t/frontend/utility.py
@@ -30,7 +30,8 @@ logger = Log(__name__).getlog()
 __all__ = [
    "load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs",
    "max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS",
-    "EOS", "UNK", "BLANK", "MASKCTC", "SPACE"
+    "EOS", "UNK", "BLANK", "MASKCTC", "SPACE", "convert_samples_to_float32",
+    "convert_samples_from_float32"
 ]

 IGNORE_ID = -1
@@ -342,3 +343,51 @@ def load_cmvn(cmvn_file: str, filetype: str):
    else:
        raise ValueError(f"cmvn file type no support: {filetype}")
    return cmvn[0], cmvn[1]
+
+
+def convert_samples_to_float32(samples):
+    """Convert audio samples to float32.
+
+    Signed-integer PCM samples (e.g. int16) are scaled by 1 / 2**(bits - 1)
+    into [-1, 1); floating-point samples are only cast to float32.
+
+    Raises TypeError for any other dtype. Example: PCM16 -> float32.
+    """
+    float32_samples = samples.astype('float32')
+    if np.issubdtype(samples.dtype, np.signedinteger):  # np.sctypes is gone in NumPy 2
+        bits = np.iinfo(samples.dtype).bits
+        float32_samples *= (1. / 2**(bits - 1))
+    elif np.issubdtype(samples.dtype, np.floating):
+        pass
+    else:
+        raise TypeError("Unsupported sample type: %s." % samples.dtype)
+    return float32_samples
+
+
+def convert_samples_from_float32(samples, dtype):
+    """Convert float32 audio samples to ``dtype``.
+
+    For a signed-integer target, samples are rescaled from [-1, 1] by
+    2**(bits - 1) and clipped to the integer range; for a floating-point
+    target they are clipped to that type's finite range and cast.
+
+    Raises TypeError for any other target dtype. Example: float32 -> PCM16.
+    """
+    dtype = np.dtype(dtype)
+    output_samples = samples.copy()  # work on a copy; the input is not modified
+    if np.issubdtype(dtype, np.signedinteger):
+        bits = np.iinfo(dtype).bits
+        output_samples *= (2**(bits - 1) / 1.)
+        min_val = np.iinfo(dtype).min
+        max_val = np.iinfo(dtype).max
+        output_samples[output_samples > max_val] = max_val
+        output_samples[output_samples < min_val] = min_val
+    elif np.issubdtype(dtype, np.floating):  # branch on the target dtype, not the input
+        min_val = np.finfo(dtype).min
+        max_val = np.finfo(dtype).max
+        output_samples[output_samples > max_val] = max_val
+        output_samples[output_samples < min_val] = min_val
+    else:
+        raise TypeError("Unsupported sample type: %s." % dtype)
+    return output_samples.astype(dtype)
+
--- a/paddlespeech/s2t/transform/spectrogram.py
+++ b/paddlespeech/s2t/transform/spectrogram.py
@@ -307,6 +307,9 @@ class IStft():
            center=self.center, )


+from paddlespeech.s2t.utils.log import Log
+logger = Log(__name__).getlog()
+
 class LogMelSpectrogramKaldi():
    def __init__(
            self,
@@ -346,7 +349,7 @@ class LogMelSpectrogramKaldi():
    def __repr__(self):
        return ("{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, "
                "n_shift={n_shift}, win_length={win_length}, window={window}, "
-                "fmin={fmin}, fmax={fmax}, eps={eps}))".format(
+                "fmin={fmin}, fmax={fmax}, eps={eps}, preemph={preemph}, dither={dither})".format(
                    name=self.__class__.__name__,
                    fs=self.fs,
                    n_mels=self.n_mels,
@@ -356,7 +359,10 @@ class LogMelSpectrogramKaldi():
                    window=self.window,
                    fmin=self.fmin,
                    fmax=self.fmax,
-                    eps=self.eps, ))
+                    eps=self.eps,
+                    preemph=self.preemph,
+                    dither=self.dither,
+                ))

    def __call__(self, x):
        """
@@ -372,9 +378,16 @@ class LogMelSpectrogramKaldi():
        """
        if x.ndim != 1:
            raise ValueError("Not support x: [Time, Channel]")
-        if x.dtype == np.int16:
-            x = x / 2**(16 - 1)
-        return logfbank(
+
+        logger.debug(f"in {x}")
+        if np.issubdtype(x.dtype, np.floating):
+            # float -> PCM16 amplitude range (assumes x is normalized to [-1, 1])
+            bits = np.iinfo(np.int16).bits
+            x = x * 2**(bits - 1)
+        logger.debug(f"b {x}")
+
+        # logfbank needs PCM16-range input
+        y = logfbank(
            signal=x,
            samplerate=self.fs,
            winlen=self.win_length,  # unit ms
@@ -387,3 +400,7 @@ class LogMelSpectrogramKaldi():
            remove_dc_offset=self.remove_dc_offset,
            preemph=self.preemph,
            wintype=self.window)
+        logger.debug(f"a {y}")
+
+
+        return y