Commit 17092cbb authored by Hui Zhang

D,T to T,D

Parent 0c9fbaf7
@@ -24,8 +24,10 @@ import soundfile
 import soxbindings as sox
 from scipy import signal
 
+from .utility import subfile_from_tar
+
 
-class AudioSegment(object):
+class AudioSegment():
     """Monaural audio segment abstraction.
 
     :param samples: Audio samples [num_samples x num_channels].
@@ -68,16 +70,20 @@ class AudioSegment(object):
                              self.duration, self.rms_db))
 
     @classmethod
-    def from_file(cls, file):
+    def from_file(cls, file, infos=None):
         """Create audio segment from audio file.
 
-        :param filepath: Filepath or file object to audio file.
-        :type filepath: str|file
-        :return: Audio segment instance.
-        :rtype: AudioSegment
+        Args:
+            filepath (str|file): Filepath or file object to audio file.
+            infos (TarLocalData, optional): tar2obj and tar2infos. Defaults to None.
+
+        Returns:
+            AudioSegment: Audio segment instance.
         """
         if isinstance(file, str) and re.findall(r".seqbin_\d+$", file):
             return cls.from_sequence_file(file)
+        elif isinstance(file, str) and file.startswith('tar:'):
+            return cls.from_file(subfile_from_tar(file, infos))
+        else:
            samples, sample_rate = soundfile.read(file, dtype='float32')
            return cls(samples, sample_rate)
...
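Note on the new `tar:` branch: `subfile_from_tar` is imported from `.utility` in this commit, but its body is not shown in the diff. A minimal sketch of what such a resolver could look like, under the assumption that the spec encodes the archive path and member name (the `"#"` separator here is an illustrative guess, not the commit's actual format):

```python
# Hypothetical stand-in for .utility.subfile_from_tar: resolve a
# "tar:<archive>#<member>" spec to a file object that soundfile can read.
import io
import tarfile

def subfile_from_tar_sketch(spec: str) -> io.BytesIO:
    """Assumed spec format: 'tar:/data/train.tar#utt1.wav'."""
    path, member = spec[len("tar:"):].split("#", 1)
    with tarfile.open(path) as tar:
        data = tar.extractfile(member).read()  # read the member's bytes
    return io.BytesIO(data)                    # file-like, seekable

# Mirrors the dispatch added to AudioSegment.from_file:
# seg = AudioSegment.from_file("tar:/data/train.tar#utt1.wav")
```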
@@ -29,10 +29,10 @@ class SpecAugmentor(AugmentorBase):
     SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
     https://arxiv.org/abs/1904.08779
 
     SpecAugment on Large Scale Datasets
     https://arxiv.org/abs/1912.05533
     """
 
     def __init__(self,
@@ -61,7 +61,7 @@ class SpecAugmentor(AugmentorBase):
             adaptive_size_ratio (float): adaptive size ratio for time masking
             max_n_time_masks (int): maximum number of time masking
             replace_with_zero (bool): pad zero on mask if true else use mean
             warp_mode (str): "PIL" (default, fast, not differentiable)
                 or "sparse_image_warp" (slow, differentiable)
         """
         super().__init__()
@@ -133,7 +133,7 @@ class SpecAugmentor(AugmentorBase):
         return self._time_mask
 
     def __repr__(self):
-        return f"specaug: F-{F}, T-{T}, F-n-{n_freq_masks}, T-n-{n_time_masks}"
+        return f"specaug: F-{self.F}, T-{self.T}, F-n-{self.n_freq_masks}, T-n-{self.n_time_masks}"
 
     def time_warp(self, x, mode='PIL'):
         """time warp for spec augment
...
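The `__repr__` fix above reads the masking parameters from the instance (`self.F`, `self.T`, ...) instead of unbound names. For reference, a sketch of one SpecAugment time mask applied under the new (T, D) layout, where frames are rows; this is illustrative, not the class's actual implementation:

```python
# Apply a single time mask of width up to T frames to a (T, D) feature.
import numpy as np

def time_mask_sketch(x: np.ndarray, T: int = 40, replace_with_zero: bool = True,
                     rng=np.random.default_rng()) -> np.ndarray:
    n_frames = x.shape[0]
    t = int(rng.integers(0, T + 1))                   # mask width in frames
    t0 = int(rng.integers(0, max(1, n_frames - t)))   # mask start frame
    fill = 0.0 if replace_with_zero else x.mean()     # matches replace_with_zero
    x = x.copy()
    x[t0:t0 + t, :] = fill                            # frames are rows in (T, D)
    return x
```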
@@ -11,3 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from .audio_featurizer import AudioFeaturizer  # noqa: F401
+from .speech_featurizer import SpeechFeaturizer
+from .text_featurizer import TextFeaturizer
@@ -18,7 +18,7 @@ from python_speech_features import logfbank
 from python_speech_features import mfcc
 
 
-class AudioFeaturizer(object):
+class AudioFeaturizer():
     """Audio featurizer, for extracting features from audio contents of
     AudioSegment or SpeechSegment.
@@ -167,32 +167,6 @@ class AudioFeaturizer(object):
             raise ValueError("Unknown spectrum_type %s. "
                              "Supported values: linear." % self._spectrum_type)
 
-    def _compute_linear_specgram(self,
-                                 samples,
-                                 sample_rate,
-                                 stride_ms=10.0,
-                                 window_ms=20.0,
-                                 max_freq=None,
-                                 eps=1e-14):
-        """Compute the linear spectrogram from FFT energy."""
-        if max_freq is None:
-            max_freq = sample_rate / 2
-        if max_freq > sample_rate / 2:
-            raise ValueError("max_freq must not be greater than half of "
-                             "sample rate.")
-        if stride_ms > window_ms:
-            raise ValueError("Stride size must not be greater than "
-                             "window size.")
-        stride_size = int(0.001 * sample_rate * stride_ms)
-        window_size = int(0.001 * sample_rate * window_ms)
-        specgram, freqs = self._specgram_real(
-            samples,
-            window_size=window_size,
-            stride_size=stride_size,
-            sample_rate=sample_rate)
-        ind = np.where(freqs <= max_freq)[0][-1] + 1
-        return np.log(specgram[:ind, :] + eps)
-
     def _specgram_real(self, samples, window_size, stride_size, sample_rate):
         """Compute the spectrogram for samples from a real signal."""
         # extract strided windows
@@ -217,26 +191,65 @@ class AudioFeaturizer(object):
         freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
         return fft, freqs
 
+    def _compute_linear_specgram(self,
+                                 samples,
+                                 sample_rate,
+                                 stride_ms=10.0,
+                                 window_ms=20.0,
+                                 max_freq=None,
+                                 eps=1e-14):
+        """Compute the linear spectrogram from FFT energy.
+
+        Args:
+            samples ([type]): [description]
+            sample_rate ([type]): [description]
+            stride_ms (float, optional): [description]. Defaults to 10.0.
+            window_ms (float, optional): [description]. Defaults to 20.0.
+            max_freq ([type], optional): [description]. Defaults to None.
+            eps ([type], optional): [description]. Defaults to 1e-14.
+
+        Raises:
+            ValueError: [description]
+            ValueError: [description]
+
+        Returns:
+            np.ndarray: log spectrogram, (time, freq)
+        """
+        if max_freq is None:
+            max_freq = sample_rate / 2
+        if max_freq > sample_rate / 2:
+            raise ValueError("max_freq must not be greater than half of "
+                             "sample rate.")
+        if stride_ms > window_ms:
+            raise ValueError("Stride size must not be greater than "
+                             "window size.")
+        stride_size = int(0.001 * sample_rate * stride_ms)
+        window_size = int(0.001 * sample_rate * window_ms)
+        specgram, freqs = self._specgram_real(
+            samples,
+            window_size=window_size,
+            stride_size=stride_size,
+            sample_rate=sample_rate)
+        ind = np.where(freqs <= max_freq)[0][-1] + 1
+        # (freq, time)
+        spec = np.log(specgram[:ind, :] + eps)
+        return np.transpose(spec)
     def _concat_delta_delta(self, feat):
-        """Append delta, delta-delta feature.
+        """Append delta, delta-delta feature.
 
         Args:
-            feat (np.ndarray): (D, T)
+            feat (np.ndarray): (T, D)
 
         Returns:
-            np.ndarray: feat with delta-delta, (3*D, T)
+            np.ndarray: feat with delta-delta, (T, 3*D)
         """
-        feat = np.transpose(feat)
         # Deltas
         d_feat = delta(feat, 2)
         # Deltas-Deltas
         dd_feat = delta(feat, 2)
-        # transpose
-        feat = np.transpose(feat)
-        d_feat = np.transpose(d_feat)
-        dd_feat = np.transpose(dd_feat)
         # concat above three features
-        concat_feat = np.concatenate((feat, d_feat, dd_feat))
+        concat_feat = np.concatenate((feat, d_feat, dd_feat), axis=1)
         return concat_feat
 
     def _compute_mfcc(self,
@@ -292,7 +305,6 @@ class AudioFeaturizer(object):
             ceplifter=22,
             useEnergy=True,
             winfunc='povey')
-        mfcc_feat = np.transpose(mfcc_feat)
         if delta_delta:
             mfcc_feat = self._concat_delta_delta(mfcc_feat)
         return mfcc_feat
@@ -346,8 +358,6 @@ class AudioFeaturizer(object):
             remove_dc_offset=True,
             preemph=0.97,
             wintype='povey')
-        fbank_feat = np.transpose(fbank_feat)
-
         if delta_delta:
             fbank_feat = self._concat_delta_delta(fbank_feat)
         return fbank_feat
...
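With the transposes removed above, every extractor now hands `_concat_delta_delta` features directly in (T, D), and concatenation runs along the feature axis. A quick shape check of that flow (note that a delta-delta is conventionally the delta of the delta, whereas the method as written computes `delta(feat, 2)` twice):

```python
# Shape check for the new (T, D) convention, mirroring _concat_delta_delta.
import numpy as np
from python_speech_features import delta

T, D = 100, 80                     # e.g. 100 frames of 80-dim fbank
feat = np.random.randn(T, D).astype('float32')

d_feat = delta(feat, 2)            # first-order deltas, still (T, D)
dd_feat = delta(d_feat, 2)         # delta of the delta, (T, D)
concat = np.concatenate((feat, d_feat, dd_feat), axis=1)
assert concat.shape == (T, 3 * D)  # matches the updated docstring
```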
@@ -16,38 +16,8 @@ from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer
 from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
 
 
-class SpeechFeaturizer(object):
-    """Speech featurizer, for extracting features from both audio and transcript
-    contents of SpeechSegment.
-
-    Currently, for audio parts, it supports feature types of linear
-    spectrogram and mfcc; for transcript parts, it only supports char-level
-    tokenizing and conversion into a list of token indices. Note that the
-    token indexing order follows the given vocabulary file.
-
-    :param vocab_filepath: Filepath to load vocabulary for token indices
-                           conversion.
-    :type vocab_filepath: str
-    :param spectrum_type: Specgram feature type. Options: 'linear', 'mfcc'.
-    :type spectrum_type: str
-    :param stride_ms: Striding size (in milliseconds) for generating frames.
-    :type stride_ms: float
-    :param window_ms: Window size (in milliseconds) for generating frames.
-    :type window_ms: float
-    :param max_freq: When spectrum_type is 'linear', only FFT bins
-                     corresponding to frequencies between [0, max_freq] are
-                     returned; when spectrum_type is 'mfcc', max_freq is the
-                     highest band edge of mel filters.
-    :type max_freq: None|float
-    :param target_sample_rate: Speech are resampled (if upsampling or
-                               downsampling is allowed) to this before
-                               extracting spectrogram features.
-    :type target_sample_rate: float
-    :param use_dB_normalization: Whether to normalize the audio to a certain
-                                 decibels before extracting the features.
-    :type use_dB_normalization: bool
-    :param target_dB: Target audio decibels for normalization.
-    :type target_dB: float
+class SpeechFeaturizer():
+    """Speech and Text feature extraction.
     """
 
     def __init__(self,
@@ -64,8 +34,12 @@ class SpeechFeaturizer(object):
                  target_sample_rate=16000,
                  use_dB_normalization=True,
                  target_dB=-20,
-                 dither=1.0):
-        self._audio_featurizer = AudioFeaturizer(
+                 dither=1.0,
+                 maskctc=False):
+        self.stride_ms = stride_ms
+        self.window_ms = window_ms
+
+        self.audio_feature = AudioFeaturizer(
             spectrum_type=spectrum_type,
             feat_dim=feat_dim,
             delta_delta=delta_delta,
@@ -77,8 +51,14 @@ class SpeechFeaturizer(object):
             use_dB_normalization=use_dB_normalization,
             target_dB=target_dB,
             dither=dither)
-        self._text_featurizer = TextFeaturizer(unit_type, vocab_filepath,
-                                               spm_model_prefix)
+        self.feature_size = self.audio_feature.feature_size
+
+        self.text_feature = TextFeaturizer(
+            unit_type=unit_type,
+            vocab_filepath=vocab_filepath,
+            spm_model_prefix=spm_model_prefix,
+            maskctc=maskctc)
+        self.vocab_size = self.text_feature.vocab_size
 
     def featurize(self, speech_segment, keep_transcription_text):
         """Extract features for speech segment.
@@ -94,66 +74,33 @@ class SpeechFeaturizer(object):
         Returns:
             tuple: 1) spectrogram audio feature in 2darray, 2) list of token indices.
         """
-        spec_feature = self._audio_featurizer.featurize(speech_segment)
+        spec_feature = self.audio_feature.featurize(speech_segment)
+
         if keep_transcription_text:
             return spec_feature, speech_segment.transcript
+
         if speech_segment.has_token:
             text_ids = speech_segment.token_ids
         else:
-            text_ids = self._text_featurizer.featurize(
-                speech_segment.transcript)
+            text_ids = self.text_feature.featurize(speech_segment.transcript)
         return spec_feature, text_ids
 
-    @property
-    def vocab_size(self):
-        """Return the vocabulary size.
-
-        Returns:
-            int: Vocabulary size.
-        """
-        return self._text_featurizer.vocab_size
-
-    @property
-    def vocab_list(self):
-        """Return the vocabulary in list.
-
-        Returns:
-            List[str]:
-        """
-        return self._text_featurizer.vocab_list
-
-    @property
-    def vocab_dict(self):
-        """Return the vocabulary in dict.
-
-        Returns:
-            Dict[str, int]:
-        """
-        return self._text_featurizer.vocab_dict
-
-    @property
-    def feature_size(self):
-        """Return the audio feature size.
-
-        Returns:
-            int: audio feature size.
-        """
-        return self._audio_featurizer.feature_size
-
-    @property
-    def stride_ms(self):
-        """time length in `ms` unit per frame
-
-        Returns:
-            float: time(ms)/frame
-        """
-        return self._audio_featurizer.stride_ms
-
-    @property
-    def text_feature(self):
-        """Return the text feature object.
-
-        Returns:
-            TextFeaturizer: object.
-        """
-        return self._text_featurizer
+    def text_featurize(self, text, keep_transcription_text):
+        """Extract features for transcript text.
+
+        Keep the original text, or convert the text string to a list of
+        token indices in char-level.
+
+        Args:
+            text (str): text.
+            keep_transcription_text (bool): True, keep transcript text; False, token ids.
+
+        Returns:
+            (str|List[int]): text, or list of token indices.
+        """
+        if keep_transcription_text:
+            return text
+
+        text_ids = self.text_feature.featurize(text)
+        return text_ids
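The refactor replaces the `@property` forwarders with plain attributes set in `__init__` (`feature_size`, `vocab_size`, `stride_ms`, `window_ms`, and the `audio_feature`/`text_feature` objects) and adds a text-only entry point. A hedged usage sketch; the parameter values and vocab path are placeholders, and keyword arguments are used because the exact positional order is not shown in this hunk:

```python
# Illustrative use of the refactored interface (placeholder arguments).
featurizer = SpeechFeaturizer(
    unit_type='char',            # assumption: char-level units
    vocab_filepath='vocab.txt',  # placeholder path
    spectrum_type='fbank',
    feat_dim=80)

# Plain attribute access replaces the removed @property forwarders:
print(featurizer.feature_size)   # was self._audio_featurizer.feature_size
print(featurizer.vocab_size)     # was self._text_featurizer.vocab_size

# New text-only entry point:
ids = featurizer.text_featurize("hello world", keep_transcription_text=False)
```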
@@ -40,21 +40,21 @@ class CollateFunc(object):
         number = 0
         for item in batch:
             audioseg = AudioSegment.from_file(item['feat'])
-            feat = self.feature_func(audioseg)  #(D, T)
+            feat = self.feature_func(audioseg)  #(T, D)
 
-            sums = np.sum(feat, axis=1)
+            sums = np.sum(feat, axis=0)
             if mean_stat is None:
                 mean_stat = sums
             else:
                 mean_stat += sums
 
-            square_sums = np.sum(np.square(feat), axis=1)
+            square_sums = np.sum(np.square(feat), axis=0)
             if var_stat is None:
                 var_stat = square_sums
             else:
                 var_stat += square_sums
 
-            number += feat.shape[1]
+            number += feat.shape[0]
         return number, mean_stat, var_stat
@@ -120,7 +120,7 @@ class FeatureNormalizer(object):
         """Normalize features to be of zero mean and unit stddev.
 
         :param features: Input features to be normalized.
-        :type features: ndarray, shape (D, T)
+        :type features: ndarray, shape (T, D)
         :param eps: added to stddev to provide numerical stability.
         :type eps: float
         :return: Normalized features.
@@ -130,9 +130,10 @@ class FeatureNormalizer(object):
     def _read_mean_std_from_file(self, filepath, eps=1e-20):
         """Load mean and std from file."""
-        mean, istd = load_cmvn(filepath, filetype='json')
-        self._mean = np.expand_dims(mean, axis=-1)
-        self._istd = np.expand_dims(istd, axis=-1)
+        filetype = filepath.split(".")[-1]
+        mean, istd = load_cmvn(filepath, filetype=filetype)
+        self._mean = np.expand_dims(mean, axis=0)
+        self._istd = np.expand_dims(istd, axis=0)
 
     def write_to_file(self, filepath):
         """Write the mean and stddev to the file.
...
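With the layout change, the collate function sums over `axis=0` (frames) and counts `feat.shape[0]` frames, so the accumulated `mean_stat` and `var_stat` are per-dimension (D,) vectors, and the mean/istd are expanded along `axis=0` to broadcast over (T, D) features. A sketch of turning those running sums into CMVN statistics; the helper name is illustrative, not part of the commit:

```python
# Finalize CMVN stats from sum and sum-of-squares accumulators.
import numpy as np

def finalize_cmvn(number, mean_stat, var_stat, eps=1e-20):
    """number: total frame count; mean_stat/var_stat: (D,) sums over all frames."""
    mean = mean_stat / number                    # E[x], shape (D,)
    var = var_stat / number - np.square(mean)    # E[x^2] - E[x]^2, shape (D,)
    std = np.sqrt(np.maximum(var, 0.0)) + eps    # clamp tiny negatives from rounding
    return mean, std

# Normalization then broadcasts over frames, matching the new axis=0 expansion:
#   normalized = (feat - mean[np.newaxis, :]) / std[np.newaxis, :]   # feat: (T, D)
```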
@@ -68,7 +68,12 @@ class SpeechSegment(AudioSegment):
         return not self.__eq__(other)
 
     @classmethod
-    def from_file(cls, filepath, transcript, tokens=None, token_ids=None):
+    def from_file(cls,
+                  filepath,
+                  transcript,
+                  tokens=None,
+                  token_ids=None,
+                  infos=None):
         """Create speech segment from audio file and corresponding transcript.
 
         Args:
@@ -76,12 +81,12 @@ class SpeechSegment(AudioSegment):
             transcript (str): Transcript text for the speech.
             tokens (List[str], optional): text tokens. Defaults to None.
             token_ids (List[int], optional): text token ids. Defaults to None.
+            infos (TarLocalData, optional): tar2obj and tar2infos. Defaults to None.
 
         Returns:
             SpeechSegment: Speech segment instance.
         """
-        audio = AudioSegment.from_file(filepath)
+        audio = AudioSegment.from_file(filepath, infos)
         return cls(audio.samples, audio.sample_rate, transcript, tokens,
                    token_ids)
...
@@ -56,8 +56,8 @@ class SpeechCollator():
         for utt, audio, text in batch:
             utts.append(utt)
             # audio
-            audios.append(audio.T)  # [T, D]
-            audio_lens.append(audio.shape[1])
+            audios.append(audio)  # [T, D]
+            audio_lens.append(audio.shape[0])
             # text
             # for training, text is token ids
             # else text is string, convert to unicode ord
...
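Since features now arrive from the featurizer already in [T, D], the collator appends them without the extra `.T` and reads the frame count from `shape[0]`. For context, a sketch of how such a batch is typically padded downstream; `pad_batch` is a hypothetical helper, not part of this commit:

```python
# Zero-pad a list of (T_i, D) feature arrays to a dense (B, T_max, D) batch.
import numpy as np

def pad_batch(feats):
    """feats: list of (T_i, D) arrays. Returns (B, T_max, D) batch and lengths."""
    lens = np.array([f.shape[0] for f in feats], dtype=np.int64)  # frame counts
    T_max, D = int(lens.max()), feats[0].shape[1]
    out = np.zeros((len(feats), T_max, D), dtype=feats[0].dtype)
    for i, f in enumerate(feats):
        out[i, :f.shape[0], :] = f   # copy frames, leave tail zero-padded
    return out, lens
```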
@@ -3,8 +3,9 @@ export MAIN_ROOT=${PWD}/../../../
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 
+export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 
 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
...