diff --git a/deepspeech/frontend/featurizer/audio_featurizer.py b/deepspeech/frontend/featurizer/audio_featurizer.py index f209d305d418a459da4ba8e608d79f3870371fe6..b537e733511fbb7951781e5b3b0cffbbdd38b0e6 100644 --- a/deepspeech/frontend/featurizer/audio_featurizer.py +++ b/deepspeech/frontend/featurizer/audio_featurizer.py @@ -175,6 +175,7 @@ class AudioFeaturizer(object): max_freq=None, eps=1e-14): """Compute the linear spectrogram from FFT energy.""" + # return T,D if max_freq is None: max_freq = sample_rate / 2 if max_freq > sample_rate / 2: @@ -190,8 +191,12 @@ class AudioFeaturizer(object): window_size=window_size, stride_size=stride_size, sample_rate=sample_rate) + ind = np.where(freqs <= max_freq)[0][-1] + 1 - return np.log(specgram[:ind, :] + eps) + specgram = np.log(specgram[:ind, :] + eps) + + specgram = np.transpose(specgram) #T,D + return specgram def _specgram_real(self, samples, window_size, stride_size, sample_rate): """Compute the spectrogram for samples from a real signal.""" @@ -294,6 +299,7 @@ class AudioFeaturizer(object): ceplifter=22, useEnergy=True, winfunc='povey') + mfcc_feat = np.transpose(mfcc_feat) if delta_delta: mfcc_feat = self._concat_delta_delta(mfcc_feat) diff --git a/deepspeech/frontend/normalizer.py b/deepspeech/frontend/normalizer.py index 287b51e587c29d133fd56a31a3925954fb99fff5..0bf24edd10556acecec56de0685b0930bec6b1a6 100644 --- a/deepspeech/frontend/normalizer.py +++ b/deepspeech/frontend/normalizer.py @@ -131,8 +131,8 @@ class FeatureNormalizer(object): def _read_mean_std_from_file(self, filepath, eps=1e-20): """Load mean and std from file.""" mean, istd = load_cmvn(filepath, filetype='json') - self._mean = np.expand_dims(mean, axis=-1) - self._istd = np.expand_dims(istd, axis=-1) + self._mean = np.expand_dims(mean, axis=0) + self._istd = np.expand_dims(istd, axis=0) def write_to_file(self, filepath): """Write the mean and stddev to the file. diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index f105acc064cc02930696a69df97ce1506d6bdb07..514dc2cc3f72eb799f60aa60eab189d281606a43 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -326,10 +326,8 @@ class SpeechCollator(): audio, text = self.process_feature_and_transform(audio, text) #utt utts.append(utt) - # audio # print("---debug---") # print(audio.shape) - audio=audio.T audios.append(audio) # [T, D] audio_lens.append(audio.shape[0]) # text @@ -358,7 +356,7 @@ class SpeechCollator(): self.randomize_feature_parameters(min(audio_lens), n_bins) for i in range(len(padded_audios)): if not self._randomize_each_batch: - self.randomize_feature_parameters(n_bins, audio_lens[i]) + self.randomize_feature_parameters(audio_lens[i], n_bins) padded_audios[i] = self._augmentation_pipeline.apply_feature_transform(padded_audios[i]) return utts, padded_audios, audio_lens, padded_texts, text_lens diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml index 1fe21a406276046eaccbd9ec376944a49f5270c0..e7a5c6dcff5d59c377297f4ec98d8e84630cfcc2 100644 --- a/examples/aishell/s0/conf/deepspeech2.yaml +++ b/examples/aishell/s0/conf/deepspeech2.yaml @@ -11,7 +11,7 @@ data: max_output_input_ratio: .inf collator: - batch_size: 32 #64 # one gpu + batch_size: 64 # one gpu randomize_each_batch: False mean_std_filepath: data/mean_std.json unit_type: char