提交 47915461 编写于 作者: Y Yibing Liu

Adapt waveflow to internal dataset

上级 e58e927c
...@@ -35,8 +35,7 @@ class Dataset(ljspeech.LJSpeech): ...@@ -35,8 +35,7 @@ class Dataset(ljspeech.LJSpeech):
fname, _, _ = metadatum fname, _, _ = metadatum
wav_path = os.path.join(self.root, "wavs", fname + ".wav") wav_path = os.path.join(self.root, "wavs", fname + ".wav")
loaded_sr, audio = read(wav_path) audio, loaded_sr = librosa.load(wav_path, sr=self.config.sample_rate)
assert loaded_sr == self.config.sample_rate
return audio return audio
...@@ -91,8 +90,6 @@ class Subset(DatasetMixin): ...@@ -91,8 +90,6 @@ class Subset(DatasetMixin):
mode='constant', mode='constant',
constant_values=0) constant_values=0)
# Normalize audio to the [-1, 1] range.
audio = audio.astype(np.float32) / 32768.0
mel = self.get_mel(audio) mel = self.get_mel(audio)
return audio, mel return audio, mel
......
...@@ -62,9 +62,8 @@ class WaveFlowLoss: ...@@ -62,9 +62,8 @@ class WaveFlowLoss:
class Conditioner(dg.Layer): class Conditioner(dg.Layer):
def __init__(self, dtype): def __init__(self, dtype, upsample_factors):
super(Conditioner, self).__init__() super(Conditioner, self).__init__()
upsample_factors = [16, 16]
self.upsample_conv2d = [] self.upsample_conv2d = []
for s in upsample_factors: for s in upsample_factors:
...@@ -296,11 +295,13 @@ class WaveFlowModule(dg.Layer): ...@@ -296,11 +295,13 @@ class WaveFlowModule(dg.Layer):
self.n_flows = config.n_flows self.n_flows = config.n_flows
self.n_group = config.n_group self.n_group = config.n_group
self.n_layers = config.n_layers self.n_layers = config.n_layers
self.upsample_factors = config.upsample_factors if hasattr(
config, "upsample_factors") else [16, 16]
assert self.n_group % 2 == 0 assert self.n_group % 2 == 0
assert self.n_flows % 2 == 0 assert self.n_flows % 2 == 0
self.dtype = "float16" if config.use_fp16 else "float32" self.dtype = "float16" if config.use_fp16 else "float32"
self.conditioner = Conditioner(self.dtype) self.conditioner = Conditioner(self.dtype, self.upsample_factors)
self.flows = [] self.flows = []
for i in range(self.n_flows): for i in range(self.n_flows):
flow = Flow(config) flow = Flow(config)
...@@ -397,6 +398,10 @@ class WaveFlowModule(dg.Layer): ...@@ -397,6 +398,10 @@ class WaveFlowModule(dg.Layer):
if self.dtype == "float16": if self.dtype == "float16":
mel = fluid.layers.cast(mel, self.dtype) mel = fluid.layers.cast(mel, self.dtype)
mel = self.conditioner.infer(mel) mel = self.conditioner.infer(mel)
# Prune out the tail of mel so that its length is divisible by n_group (time % n_group == 0).
pruned_len = int(mel.shape[2] // self.n_group * self.n_group)
if mel.shape[2] > pruned_len:
mel = mel[:, :, :pruned_len]
# From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group] # From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group]
mel = fluid.layers.transpose(unfold(mel, self.n_group), [0, 1, 3, 2]) mel = fluid.layers.transpose(unfold(mel, self.n_group), [0, 1, 3, 2])
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册