Deep Voice 3 + WaveFlow Noisy Output
Created by: aayushkubb
I am trying to use the WaveFlow vocoder with DeepVoice3. To implement this, I have made minor tweaks to the codebase.
First, I modified examples/deepvoice3/utils.py so that it can return only the mel bands instead of the synthesized wav:
@fluid.framework.dygraph_only
def eval_model(model, text, replace_pronounciation_prob, min_level_db,
               ref_level_db, power, n_iter, win_length, hop_length,
               preemphasis, mel_only=False):
    """Generate a waveform (or a raw mel spectrogram) from text with a
    DeepVoice3 model.

    Args:
        model: a DeepVoice3 model in dygraph mode.
        text (str): the input text.
        replace_pronounciation_prob (float): probability forwarded to the
            text front-end when converting text to a symbol sequence.
        min_level_db, ref_level_db, power, n_iter, win_length, hop_length,
        preemphasis: spectrogram-inversion (Griffin-Lim) parameters
            forwarded to ``spec_to_waveform``.
        mel_only (bool): when True, skip spectrogram inversion and return
            the model's mel output directly (for an external vocoder).

    Returns:
        ``(wav, alignments)`` when ``mel_only`` is False;
        ``(mel_outputs, alignments)`` when ``mel_only`` is True.  Note
        that ``mel_outputs`` is a dygraph Variable, not a numpy array.
    """
    text = np.array(
        en.text_to_sequence(text, p=replace_pronounciation_prob),
        dtype=np.int64)
    length = len(text)
    print("text sequence's length: {}".format(length))
    # Positions are 1-based; 0 is reserved for padding.
    text_positions = np.arange(1, 1 + length)

    # Add a batch dimension (batch_size = 1).
    text = np.expand_dims(text, 0)
    text_positions = np.expand_dims(text_positions, 0)
    model.eval()
    mel_outputs, linear_outputs, alignments, done = model.transduce(
        dg.to_variable(text), dg.to_variable(text_positions))

    if mel_only:
        # Return the mel spectrogram for an external vocoder (e.g.
        # WaveFlow) instead of running Griffin-Lim on the linear output.
        return mel_outputs, alignments.numpy()[0]

    linear_outputs_np = linear_outputs.numpy()[0].T  # (C, T)
    wav = spec_to_waveform(linear_outputs_np, min_level_db, ref_level_db,
                           power, n_iter, win_length, hop_length,
                           preemphasis)
    alignments_np = alignments.numpy()[0]  # batch_size = 1
    print("linear_outputs's shape: ", linear_outputs_np.shape)
    # Fixed typo in the log message ("alignmnets" -> "alignments").
    print("alignments' shape:", alignments.shape)
    return wav, alignments_np
Now I call the modified eval_model from DeepVoice3 so that it returns the mel output:
# Run DeepVoice3 in mel-only mode: with mel_only=True the first return
# value is the mel spectrogram (a dygraph Variable), not a waveform.
mel_wav, attn = eval_model(dv3, text, replace_pronounciation_prob,
min_level_db, ref_level_db, power,
n_iter, win_length, hop_length,
preemphasis,mel_only=mel_only)
mel=mel_wav
Then I rearrange the mel to match WaveFlow's expected mel input layout:
# BUG: reshape does NOT permute axes -- it only reinterprets the buffer
# in row-major order, so "reshaping" (batch, T, mel_bands) into
# (batch, mel_bands, T) scrambles the spectrogram and the vocoder emits
# noise.  Swapping the time and channel axes requires a real transpose.
a, b, c = mel.shape
mel_new = F.transpose(mel, perm=[0, 2, 1])  # (batch, mel_bands, T)
Once I have these mels, I pass them to WaveFlow for synthesis:
# Build the pre-trained WaveFlow vocoder from its checkpoint directory;
# build() restores the weights and returns the checkpoint iteration.
waveflow_model = WaveFlow(waveflow_config,args.waveflow_checkpoint_dir)
waveflow_iteration = waveflow_model.build()
@dg.no_grad
def infer(self, mel):
    """Synthesize a waveform from a mel spectrogram with WaveFlow.

    Args:
        mel: mel spectrogram Variable; WaveFlow expects the layout
            (batch, mel_bands, T) -- TODO confirm against the vocoder.

    Returns:
        (audio, start_time, syn_time): the synthesized audio, the
        wall-clock start time, and the elapsed synthesis time (seconds).
    """
    # self.waveflow.eval()
    print(mel.shape, 'mel.shape')
    start_time = time.time()
    # Removed the unused local `config = self.config`; sigma is read
    # from self.config directly, as before.
    audio = self.waveflow.synthesize(mel, sigma=self.config.sigma)
    syn_time = time.time() - start_time
    return audio, start_time, syn_time
# Create the waveform with the WaveFlow vocoder.
wav, start_time, syn_time = waveflow_model.infer(mel_new)
wav = wav[0]  # drop the batch dimension
wav_time = wav.shape[0] / waveflow_config.sample_rate
print("audio time {:.4f}, synthesis time {:.4f}".format(wav_time,
                                                        syn_time))

# Denormalize audio from [-1, 1] to the int16 range.  Clip first and
# scale by 32767: multiplying by 32768 maps a sample at +1.0 to 32768.0,
# which overflows int16 in the astype cast and wraps to -32768 (a click).
wav = np.clip(wav.numpy().astype("float32"), -1.0, 1.0) * 32767.0
wav = wav.astype('int16')

sample_rate = waveflow_config.sample_rate
# Save the attention alignment plot and the synthesized wav.
plot_alignment(
    attn,
    os.path.join(synthesis_dir,
                 "test_{}_step_{}.png".format(idx, iteration)))
sf.write(
    os.path.join(synthesis_dir,
                 "test_{}_step{}.wav".format(idx, iteration)),
    wav, sample_rate)
However, I only get either noise or a blank wav as output.
I also tried preprocessing the mels similarly to what WaveFlow does:
def process_mel(mel, config):
    """Apply WaveFlow-style dynamic-range compression to a mel
    spectrogram: clip to [1e-5, 10], scale by 100, then take the log.
    """
    # Floor keeps the log numerically stable; gain matches the
    # reference constant used during training.
    floor, ceiling, gain = 1e-5, 10, 100
    clipped = fluid.layers.clip(x=mel, min=floor, max=ceiling)
    return fluid.layers.log(fluid.layers.scale(x=clipped, scale=gain))
but the results are still the same. Can you help me identify what exactly I am doing wrong?
My assumption is that I am not supplying the mels to WaveFlow in the format it expects.
Thanks