Fix the spectrogram shape bug: the featurizer now returns features as [T, D]

82ca0f65 · Haoxin Ma · 043127b6 · 82ca0f65 · 82ca0f65 · 82ca0f65
4 changed files
--- a/deepspeech/frontend/featurizer/audio_featurizer.py
+++ b/deepspeech/frontend/featurizer/audio_featurizer.py
@@ -175,6 +175,7 @@ class AudioFeaturizer(object):
                                 max_freq=None,
                                 eps=1e-14):
        """Compute the linear spectrogram from FFT energy."""
+        # return T,D
        if max_freq is None:
            max_freq = sample_rate / 2
        if max_freq > sample_rate / 2:
@@ -190,8 +191,12 @@ class AudioFeaturizer(object):
            window_size=window_size,
            stride_size=stride_size,
            sample_rate=sample_rate)
        ind = np.where(freqs <= max_freq)[0][-1] + 1
-        return np.log(specgram[:ind, :] + eps)
+        specgram = np.log(specgram[:ind, :] + eps)
+        specgram = np.transpose(specgram) #T,D
+        return specgram
    def _specgram_real(self, samples, window_size, stride_size, sample_rate):
        """Compute the spectrogram for samples from a real signal."""
@@ -294,6 +299,7 @@ class AudioFeaturizer(object):
            ceplifter=22,
            useEnergy=True,
            winfunc='povey')
        mfcc_feat = np.transpose(mfcc_feat)
        if delta_delta:
            mfcc_feat = self._concat_delta_delta(mfcc_feat)

--- a/deepspeech/frontend/normalizer.py
+++ b/deepspeech/frontend/normalizer.py
@@ -131,8 +131,8 @@ class FeatureNormalizer(object):
    def _read_mean_std_from_file(self, filepath, eps=1e-20):
        """Load mean and std from file."""
        mean, istd = load_cmvn(filepath, filetype='json')
-        self._mean = np.expand_dims(mean, axis=-1)
+        self._mean = np.expand_dims(mean, axis=0)
-        self._istd = np.expand_dims(istd, axis=-1)
+        self._istd = np.expand_dims(istd, axis=0)
    def write_to_file(self, filepath):
        """Write the mean and stddev to the file.

--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@@ -326,10 +326,8 @@ class SpeechCollator():
            audio, text = self.process_feature_and_transform(audio, text)
            #utt
            utts.append(utt)
-            # audio
            # print("---debug---")
            # print(audio.shape)
-            audio=audio.T
            audios.append(audio)  # [T, D]
            audio_lens.append(audio.shape[0])
            # text
@@ -358,7 +356,7 @@ class SpeechCollator():
        self.randomize_feature_parameters(min(audio_lens), n_bins)
        for i in range(len(padded_audios)):
            if not self._randomize_each_batch: 
-                self.randomize_feature_parameters(n_bins, audio_lens[i])
+                self.randomize_feature_parameters(audio_lens[i], n_bins)
            padded_audios[i] = self._augmentation_pipeline.apply_feature_transform(padded_audios[i])
        return utts, padded_audios, audio_lens, padded_texts, text_lens

--- a/examples/aishell/s0/conf/deepspeech2.yaml
+++ b/examples/aishell/s0/conf/deepspeech2.yaml
@@ -11,7 +11,7 @@ data:
  max_output_input_ratio: .inf
 collator:
-  batch_size: 32 #64 # one gpu
+  batch_size: 64 # one gpu
  randomize_each_batch: False
  mean_std_filepath: data/mean_std.json
  unit_type: char