未验证 提交 caa391f4 编写于 作者: 小湉湉's avatar 小湉湉 提交者: GitHub

fix speedyspeech inference, test=tts (#1322)

上级 0c4895cd
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import numpy as np
 import paddle
 from paddle import nn
@@ -23,18 +22,16 @@ def expand(encodings: paddle.Tensor, durations: paddle.Tensor) -> paddle.Tensor:
     encodings: (B, T, C)
     durations: (B, T)
     """
-    batch_size, t_enc = durations.shape
-    durations = durations.numpy()
-    slens = np.sum(durations, -1)
-    t_dec = np.max(slens)
-    M = np.zeros([batch_size, t_dec, t_enc])
+    batch_size, t_enc = paddle.shape(durations)
+    slens = paddle.sum(durations, -1)
+    t_dec = paddle.max(slens)
+    M = paddle.zeros([batch_size, t_dec, t_enc])
     for i in range(batch_size):
         k = 0
         for j in range(t_enc):
             d = durations[i, j]
             M[i, k:k + d, j] = 1
             k += d
-    M = paddle.to_tensor(M, dtype=encodings.dtype)
     encodings = paddle.matmul(M, encodings)
     return encodings
@@ -234,28 +231,14 @@ class SpeedySpeech(nn.Layer):
         encodings = self.encoder(text, tones, spk_id)
-        if type(durations) == type(None):
-            pred_durations = self.duration_predictor(encodings) # (1, T)
+        if durations is None:
+            # (1, T)
+            pred_durations = self.duration_predictor(encodings)
             durations_to_expand = paddle.round(pred_durations.exp())
-            durations_to_expand = (durations_to_expand).astype(paddle.int64)
+            durations_to_expand = durations_to_expand.astype(paddle.int64)
-            slens = paddle.sum(durations_to_expand, -1) # [1]
-            t_dec = slens[0] # [1]
-            t_enc = paddle.shape(pred_durations)[-1]
-            M = paddle.zeros([1, t_dec, t_enc])
-            k = paddle.full([1], 0, dtype=paddle.int64)
-            for j in range(t_enc):
-                d = durations_to_expand[0, j]
-                # If the d == 0, slice action is meaningless and not supported
-                if d >= 1:
-                    M[0, k:k + d, j] = 1
-                k += d
-            encodings = paddle.matmul(M, encodings)
         else:
             durations_to_expand = durations
-            encodings = expand(encodings, durations_to_expand)
+        encodings = expand(encodings, durations_to_expand)
         shape = paddle.shape(encodings)
         t_dec, feature_size = shape[1], shape[2]
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册