提交 8b0e344c 编写于 作者: H Hui Zhang

fix logfbank using PCM16

上级 d62092ac
...@@ -23,7 +23,3 @@ process: ...@@ -23,7 +23,3 @@ process:
n_mask: 2 n_mask: 2
inplace: true inplace: true
replace_with_zero: true replace_with_zero: true
...@@ -25,6 +25,8 @@ import soxbindings as sox ...@@ -25,6 +25,8 @@ import soxbindings as sox
from scipy import signal from scipy import signal
from .utility import subfile_from_tar from .utility import subfile_from_tar
from .utility import convert_samples_to_float32
from .utility import convert_samples_from_float32
class AudioSegment(): class AudioSegment():
...@@ -689,15 +691,7 @@ class AudioSegment(): ...@@ -689,15 +691,7 @@ class AudioSegment():
Audio sample type is usually integer or float-point. Audio sample type is usually integer or float-point.
Integers will be scaled to [-1, 1] in float32. Integers will be scaled to [-1, 1] in float32.
""" """
float32_samples = samples.astype('float32') return convert_samples_to_float32(samples)
if samples.dtype in np.sctypes['int']:
bits = np.iinfo(samples.dtype).bits
float32_samples *= (1. / 2**(bits - 1))
elif samples.dtype in np.sctypes['float']:
pass
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
return float32_samples
def _convert_samples_from_float32(self, samples, dtype): def _convert_samples_from_float32(self, samples, dtype):
"""Convert sample type from float32 to dtype. """Convert sample type from float32 to dtype.
...@@ -708,20 +702,4 @@ class AudioSegment(): ...@@ -708,20 +702,4 @@ class AudioSegment():
This is for writing a audio file. This is for writing a audio file.
""" """
dtype = np.dtype(dtype) return convert_samples_from_float32(samples, dtype)
output_samples = samples.copy()
if dtype in np.sctypes['int']:
bits = np.iinfo(dtype).bits
output_samples *= (2**(bits - 1) / 1.)
min_val = np.iinfo(dtype).min
max_val = np.iinfo(dtype).max
output_samples[output_samples > max_val] = max_val
output_samples[output_samples < min_val] = min_val
elif samples.dtype in np.sctypes['float']:
min_val = np.finfo(dtype).min
max_val = np.finfo(dtype).max
output_samples[output_samples > max_val] = max_val
output_samples[output_samples < min_val] = min_val
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
return output_samples.astype(dtype)
...@@ -30,7 +30,8 @@ logger = Log(__name__).getlog() ...@@ -30,7 +30,8 @@ logger = Log(__name__).getlog()
__all__ = [ __all__ = [
"load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs", "load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs",
"max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS", "max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS",
"EOS", "UNK", "BLANK", "MASKCTC", "SPACE" "EOS", "UNK", "BLANK", "MASKCTC", "SPACE", "convert_samples_to_float32",
"convert_samples_from_float32"
] ]
IGNORE_ID = -1 IGNORE_ID = -1
...@@ -342,3 +343,51 @@ def load_cmvn(cmvn_file: str, filetype: str): ...@@ -342,3 +343,51 @@ def load_cmvn(cmvn_file: str, filetype: str):
else: else:
raise ValueError(f"cmvn file type no support: {filetype}") raise ValueError(f"cmvn file type no support: {filetype}")
return cmvn[0], cmvn[1] return cmvn[0], cmvn[1]
def convert_samples_to_float32(samples):
    """Convert audio samples to float32.

    Signed-integer PCM samples are scaled by ``1 / 2**(bits - 1)`` so that
    they land in [-1, 1); floating-point samples are only cast to float32
    and are not rescaled.  Example: int16 PCM -> float32 PCM.

    Args:
        samples (np.ndarray): Samples with a signed-integer or
            floating-point dtype.

    Returns:
        np.ndarray: A new float32 array with the same shape as ``samples``.

    Raises:
        TypeError: If the dtype is neither signed integer nor floating
            point (e.g. unsigned integer, bool, complex).
    """
    src_dtype = samples.dtype
    # np.issubdtype is used instead of np.sctypes, which was removed in
    # NumPy 2.0; the accepted dtypes are the same (signed ints and floats).
    is_signed_int = np.issubdtype(src_dtype, np.signedinteger)
    if not is_signed_int and not np.issubdtype(src_dtype, np.floating):
        raise TypeError("Unsupported sample type: %s." % src_dtype)

    float32_samples = samples.astype('float32')
    if is_signed_int:
        bits = np.iinfo(src_dtype).bits
        # The most negative integer maps exactly to -1.0.
        float32_samples *= (1. / 2**(bits - 1))
    return float32_samples
def convert_samples_from_float32(samples, dtype):
    """Convert floating-point audio samples to ``dtype``.

    For a signed-integer target, samples are rescaled from [-1, 1] to the
    integer range by multiplying with ``2**(bits - 1)``, saturated to the
    representable range and truncated toward zero by the final cast.
    For a floating-point target, samples are saturated to the finite range
    of the target type and cast.  Example: float32 PCM -> int16 PCM.

    Args:
        samples (np.ndarray): Floating-point samples; not modified.
        dtype: Target sample type; anything accepted by ``np.dtype``.

    Returns:
        np.ndarray: A new array of type ``dtype``.

    Raises:
        TypeError: If ``dtype`` is neither a signed-integer nor a
            floating-point type.
    """
    dtype = np.dtype(dtype)
    # The branch is chosen by the *target* dtype. np.issubdtype replaces
    # np.sctypes, which was removed in NumPy 2.0.
    if np.issubdtype(dtype, np.signedinteger):
        info = np.iinfo(dtype)
        # Scale and clip in float64: in float32, 2**31 and INT32_MAX compare
        # equal, so a full-scale +1.0 would escape clipping and overflow in
        # the integer cast.
        scaled = np.asarray(samples, dtype=np.float64) * 2.0**(info.bits - 1)
        lower = np.float64(info.min)
        upper = np.float64(info.max)
        if int(upper) > info.max:
            # int64 max is not representable in float64 and rounds up;
            # step down to the largest float64 that still fits.
            upper = np.nextafter(upper, np.float64(0))
        return np.clip(scaled, lower, upper).astype(dtype)
    if np.issubdtype(dtype, np.floating):
        info = np.finfo(dtype)
        return np.clip(samples, info.min, info.max).astype(dtype)
    raise TypeError("Unsupported sample type: %s." % dtype)
...@@ -307,6 +307,9 @@ class IStft(): ...@@ -307,6 +307,9 @@ class IStft():
center=self.center, ) center=self.center, )
from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
class LogMelSpectrogramKaldi(): class LogMelSpectrogramKaldi():
def __init__( def __init__(
self, self,
...@@ -346,7 +349,7 @@ class LogMelSpectrogramKaldi(): ...@@ -346,7 +349,7 @@ class LogMelSpectrogramKaldi():
def __repr__(self): def __repr__(self):
return ("{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, " return ("{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, "
"n_shift={n_shift}, win_length={win_length}, window={window}, " "n_shift={n_shift}, win_length={win_length}, window={window}, "
"fmin={fmin}, fmax={fmax}, eps={eps}))".format( "fmin={fmin}, fmax={fmax}, eps={eps}, preemph={preemph}, window={window}, dither={dither}))".format(
name=self.__class__.__name__, name=self.__class__.__name__,
fs=self.fs, fs=self.fs,
n_mels=self.n_mels, n_mels=self.n_mels,
...@@ -356,7 +359,10 @@ class LogMelSpectrogramKaldi(): ...@@ -356,7 +359,10 @@ class LogMelSpectrogramKaldi():
window=self.window, window=self.window,
fmin=self.fmin, fmin=self.fmin,
fmax=self.fmax, fmax=self.fmax,
eps=self.eps, )) eps=self.eps,
preemph=self.preemph,
window=self.window,
dither=self.dither))
def __call__(self, x): def __call__(self, x):
""" """
...@@ -372,9 +378,16 @@ class LogMelSpectrogramKaldi(): ...@@ -372,9 +378,16 @@ class LogMelSpectrogramKaldi():
""" """
if x.ndim != 1: if x.ndim != 1:
raise ValueError("Not support x: [Time, Channel]") raise ValueError("Not support x: [Time, Channel]")
if x.dtype == np.int16:
x = x / 2**(16 - 1) logger.info(f"in {x}")
return logfbank( if x.dtype in np.sctypes['float']:
# PCM32 -> PCM16
bits = np.iinfo(np.int16).bits
x = x * 2**(bits - 1)
logger.info(f"b {x}")
# logfbank need PCM16 input
y = logfbank(
signal=x, signal=x,
samplerate=self.fs, samplerate=self.fs,
winlen=self.win_length, # unit ms winlen=self.win_length, # unit ms
...@@ -387,3 +400,7 @@ class LogMelSpectrogramKaldi(): ...@@ -387,3 +400,7 @@ class LogMelSpectrogramKaldi():
remove_dc_offset=self.remove_dc_offset, remove_dc_offset=self.remove_dc_offset,
preemph=self.preemph, preemph=self.preemph,
wintype=self.window) wintype=self.window)
logger.info(f"a {y}")
return y
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册