提交 3cbfd7bf 编写于 作者: J Jerryuhoo

Add speaker embedding and speaker id for style fastspeech2 inference

上级 db121226
@@ -907,7 +907,9 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
  energy: Union[paddle.Tensor, np.ndarray]=None,
  energy_scale: Union[int, float]=None,
  energy_bias: Union[int, float]=None,
- robot: bool=False):
+ robot: bool=False,
+ spk_emb=None,
+ spk_id=None):
  """
  Parameters
  ----------
@@ -938,8 +940,9 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
  Tensor
  Output sequence of features (L, odim).
  """
+ spk_id = paddle.to_tensor(spk_id)
  normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
- text, durations=None, pitch=None, energy=None)
+ text, durations=None, pitch=None, energy=None, spk_emb=spk_emb, spk_id=spk_id)
  # priority: groundtruth > scale/bias > previous output
  # set durations
  if isinstance(durations, np.ndarray):
@@ -991,7 +994,10 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
  durations=durations,
  pitch=pitch,
  energy=energy,
- use_teacher_forcing=True)
+ use_teacher_forcing=True,
+ spk_emb=spk_emb,
+ spk_id=spk_id
+ )
  logmel = self.normalizer.inverse(normalized_mel)
  return logmel
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册