diff --git a/examples/librispeech/s1/conf/preprocess.yaml b/examples/librispeech/s1/conf/preprocess.yaml index 97ebf41de2bb106ae667825592e398046f643893..021ca4c58022f696c5218dedfe20e7244f09bd7f 100644 --- a/examples/librispeech/s1/conf/preprocess.yaml +++ b/examples/librispeech/s1/conf/preprocess.yaml @@ -10,16 +10,16 @@ process: cmvn_path: data/mean_std.json # these three processes are a.k.a. SpecAugument - type: time_warp - max_time_warp: 0 + max_time_warp: 5 inplace: true mode: PIL - type: freq_mask - F: 10 + F: 30 n_mask: 2 inplace: true - replace_with_zero: true + replace_with_zero: false - type: time_mask - T: 50 + T: 40 n_mask: 2 inplace: true - replace_with_zero: true + replace_with_zero: false diff --git a/paddlespeech/s2t/frontend/audio.py b/paddlespeech/s2t/frontend/audio.py index 4171f85bb2b12e33a02f27a3b5277ed5ccbc979e..65dccad385c07ee59edf585a603ab822d8d38607 100644 --- a/paddlespeech/s2t/frontend/audio.py +++ b/paddlespeech/s2t/frontend/audio.py @@ -24,9 +24,9 @@ import soundfile import soxbindings as sox from scipy import signal -from .utility import subfile_from_tar -from .utility import convert_samples_to_float32 from .utility import convert_samples_from_float32 +from .utility import convert_samples_to_float32 +from .utility import subfile_from_tar class AudioSegment(): diff --git a/paddlespeech/s2t/frontend/utility.py b/paddlespeech/s2t/frontend/utility.py index 58e5b1b0ce68e93c378dd9a2bc676e498c18b646..703f2127d7e71b093030658f6d88c45979770c61 100644 --- a/paddlespeech/s2t/frontend/utility.py +++ b/paddlespeech/s2t/frontend/utility.py @@ -390,4 +390,3 @@ def convert_samples_from_float32(samples, dtype): else: raise TypeError("Unsupported sample type: %s." % samples.dtype) return output_samples.astype(dtype) - diff --git a/paddlespeech/s2t/transform/spec_augment.py b/paddlespeech/s2t/transform/spec_augment.py index 83e4e2e7502390dd2610c15923eabe0be694b802..5ce950851a4ee6dbaa2bcbe529cbc89ce714a60b 100644 --- a/paddlespeech/s2t/transform/spec_augment.py +++ b/paddlespeech/s2t/transform/spec_augment.py @@ -34,6 +34,9 @@ def time_warp(x, max_time_warp=80, inplace=False, mode="PIL"): :returns numpy.ndarray: time warped spectrogram (time, freq) """ window = max_time_warp + if window == 0: + return x + if mode == "PIL": t = x.shape[0] if t - window <= window: diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index 9e576d0df39371749996d53b095ff9c4cb6f851b..da91ef92174b817bd8778b6a53518eed2b9e6f1b 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -307,9 +307,6 @@ class IStft(): center=self.center, ) -from paddlespeech.s2t.utils.log import Log -logger = Log(__name__).getlog() - class LogMelSpectrogramKaldi(): def __init__( self, @@ -347,22 +344,22 @@ class LogMelSpectrogramKaldi(): self.dither = dither def __repr__(self): - return ("{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, " - "n_shift={n_shift}, win_length={win_length}, window={window}, " - "fmin={fmin}, fmax={fmax}, eps={eps}, preemph={preemph}, window={window}, dither={dither}))".format( - name=self.__class__.__name__, - fs=self.fs, - n_mels=self.n_mels, - n_fft=self.n_fft, - n_shift=self.n_shift, - win_length=self.win_length, - window=self.window, - fmin=self.fmin, - fmax=self.fmax, - eps=self.eps, - preemph=self.preemph, - window=self.window, - dither=self.dither)) + return ( + "{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, " + "n_shift={n_shift}, win_length={win_length}, preemph={preemph}, window={window}, " + "fmin={fmin}, fmax={fmax}, eps={eps}, dither={dither}))".format( + name=self.__class__.__name__, + fs=self.fs, + n_mels=self.n_mels, + n_fft=self.n_fft, + n_shift=self.n_shift, + preemph=self.preemph, + win_length=self.win_length, + window=self.window, + fmin=self.fmin, + fmax=self.fmax, + eps=self.eps, + dither=self.dither, )) def __call__(self, x): """ @@ -379,12 +376,10 @@ class LogMelSpectrogramKaldi(): if x.ndim != 1: raise ValueError("Not support x: [Time, Channel]") - logger.info(f"in {x}") if x.dtype in np.sctypes['float']: # PCM32 -> PCM16 bits = np.iinfo(np.int16).bits x = x * 2**(bits - 1) - logger.info(f"b {x}") # logfbank need PCM16 input y = logfbank( @@ -400,7 +395,4 @@ class LogMelSpectrogramKaldi(): remove_dc_offset=self.remove_dc_offset, preemph=self.preemph, wintype=self.window) - logger.info(f"a {y}") - - return y