update wavernn, test=tts

1b0c0341 · 小湉湉 · 001afee6 · 1b0c0341 · 1b0c0341 · 1b0c0341
5 changed file
--- a/examples/csmsc/voc6/conf/default.yaml
+++ b/examples/csmsc/voc6/conf/default.yaml
@@ -12,7 +12,6 @@ n_mels: 80               # Number of mel basis.
 fmin: 80                 # Minimum freq in mel basis calculation. (Hz)
 fmax: 7600               # Maximum frequency in mel basis calculation. (Hz)
 mu_law: True             # Recommended to suppress noise if using raw bitsexit()
-peak_norm: True


 ###########################################################
@@ -22,13 +21,14 @@ model:
    rnn_dims: 512                     # Hidden dims of RNN Layers.
    fc_dims: 512
    bits: 9                           # Bit depth of signal
-    aux_context_window: 2
+    aux_context_window: 2             # Context window size for auxiliary feature.
+                                      # If set to 2, previous 2 and future 2 frames will be considered.
    aux_channels: 80                  # Number of channels for auxiliary feature conv.
                                      # Must be the same as num_mels.
    upsample_scales: [4, 5, 3, 5]     # Upsampling scales. Prodcut of these must be the same as hop size, same with pwgan here
-    compute_dims: 128
-    res_out_dims: 128
-    res_blocks: 10
+    compute_dims: 128                 # Dims of Conv1D in MelResNet.
+    res_out_dims: 128                 # Dims of output in MelResNet.
+    res_blocks: 10                    # Number of residual blocks.
    mode: RAW                         # either 'raw'(softmax on raw bits) or 'mold' (sample from mixture of logistics)
 inference:
    gen_batched: True                 # whether to genenate sample in batch mode
@@ -42,7 +42,6 @@ inference:
 batch_size: 64              # Batch size.
 batch_max_steps: 4500       # Length of each audio in batch. Make sure dividable by hop_size.
 num_workers: 2              # Number of workers in DataLoader.
-valid_size: 50

 ###########################################################
 #                     OPTIMIZER SETTING                   #

--- a/paddlespeech/t2s/audio/__init__.py
+++ b/paddlespeech/t2s/audio/__init__.py
@@ -12,5 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .audio import AudioProcessor
+from .codec import *
 from .spec_normalizer import LogMagnitude
 from .spec_normalizer import NormalizerBase
--- a/paddlespeech/t2s/audio/codec.py
+++ b/paddlespeech/t2s/audio/codec.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+
+import numpy as np
+import paddle
+
+
+# x: [0: 2**bit-1], return: [-1, 1]
+def label_2_float(x, bits):
+    return 2 * x / (2**bits - 1.) - 1.
+
+
+#x: [-1, 1], return: [0, 2**bits-1]
+def float_2_label(x, bits):
+    assert abs(x).max() <= 1.0
+    x = (x + 1.) * (2**bits - 1) / 2
+    return x.clip(0, 2**bits - 1)
+
+
+# y: [-1, 1], mu: 2**bits, return: [0, 2**bits-1]
+# see https://en.wikipedia.org/wiki/%CE%9C-law_algorithm
+# be careful the input `mu` here, which is +1 than that of the link above
+def encode_mu_law(x, mu):
+    mu = mu - 1
+    fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu)
+    return np.floor((fx + 1) / 2 * mu + 0.5)
+
+
+# from_labels = True:
+# y: [0: 2**bit-1], mu: 2**bits, return: [-1,1]
+# from_labels = False:
+# y: [-1, 1], return: [-1, 1]
+def decode_mu_law(y, mu, from_labels=True):
+    # TODO: get rid of log2 - makes no sense
+    if from_labels:
+        y = label_2_float(y, math.log2(mu))
+    mu = mu - 1
+    x = paddle.sign(y) / mu * ((1 + mu)**paddle.abs(y) - 1)
+    return x
--- a/paddlespeech/t2s/datasets/vocoder_batch_fn.py
+++ b/paddlespeech/t2s/datasets/vocoder_batch_fn.py
@@ -11,35 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import math
-
 import numpy as np
 import paddle

-
-def label_2_float(x, bits):
-    return 2 * x / (2**bits - 1.) - 1.
-
-
-def float_2_label(x, bits):
-    assert abs(x).max() <= 1.0
-    x = (x + 1.) * (2**bits - 1) / 2
-    return x.clip(0, 2**bits - 1)
-
-
-def encode_mu_law(x, mu):
-    mu = mu - 1
-    fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu)
-    return np.floor((fx + 1) / 2 * mu + 0.5)
-
-
-def decode_mu_law(y, mu, from_labels=True):
-    # TODO: get rid of log2 - makes no sense
-    if from_labels:
-        y = label_2_float(y, math.log2(mu))
-    mu = mu - 1
-    x = paddle.sign(y) / mu * ((1 + mu)**paddle.abs(y) - 1)
-    return x
+from paddlespeech.t2s.audio.codec import encode_mu_law
+from paddlespeech.t2s.audio.codec import float_2_label
+from paddlespeech.t2s.audio.codec import label_2_float


 class Clip(object):
@@ -195,10 +172,12 @@ class WaveRNNClip(Clip):
        Returns
        ----------
        Tensor
-            Auxiliary feature batch (B, C, T'), where
-            T = (T' - 2 * aux_context_window) * hop_size.
+            Input signal batch (B, 1, T).
        Tensor
            Target signal batch (B, 1, T).
+        Tensor
+            Auxiliary feature batch (B, C, T'), where
+            T = (T' - 2 * aux_context_window) * hop_size.

        """
        # check length

--- a/paddlespeech/t2s/models/wavernn/wavernn.py
+++ b/paddlespeech/t2s/models/wavernn/wavernn.py
@@ -20,7 +20,7 @@ import paddle
 from paddle import nn
 from paddle.nn import functional as F

-from paddlespeech.t2s.datasets.vocoder_batch_fn import decode_mu_law
+from paddlespeech.t2s.audio.codec import decode_mu_law
 from paddlespeech.t2s.modules.losses import sample_from_discretized_mix_logistic
 from paddlespeech.t2s.modules.nets_utils import initialize
 from paddlespeech.t2s.modules.upsample import Stretch2D
@@ -28,7 +28,7 @@ from paddlespeech.t2s.modules.upsample import Stretch2D

 class ResBlock(nn.Layer):
    def __init__(self, dims):
-        super(ResBlock, self).__init__()
+        super().__init__()
        self.conv1 = nn.Conv1D(dims, dims, kernel_size=1, bias_attr=False)
        self.conv2 = nn.Conv1D(dims, dims, kernel_size=1, bias_attr=False)
        self.batch_norm1 = nn.BatchNorm1D(dims)
@@ -205,7 +205,7 @@ class WaveRNN(nn.Layer):
        if self.mode == 'RAW':
            self.n_classes = 2**bits
        elif self.mode == 'MOL':
-            self.n_classes = 30
+            self.n_classes = 10 * 3
        else:
            RuntimeError('Unknown model mode value - ', self.mode)

@@ -333,7 +333,7 @@ class WaveRNN(nn.Layer):
        # (T, C_aux) -> (1, C_aux, T)
        c = paddle.transpose(c, [1, 0]).unsqueeze(0)
        T = paddle.shape(c)[-1]
-        wave_len = (T - 1) * self.hop_length
+        wave_len = T * self.hop_length
        # TODO remove two transpose op by modifying function pad_tensor
        c = self.pad_tensor(
            c.transpose([0, 2, 1]), pad=self.aux_context_window,
@@ -396,6 +396,8 @@ class WaveRNN(nn.Layer):
                posterior = F.softmax(logits, axis=1)
                distrib = paddle.distribution.Categorical(posterior)
                # corresponding operate [np.floor((fx + 1) / 2 * mu + 0.5)] in enocde_mu_law
+                # distrib.sample([1])[0].cast('float32'): [0, 2**bits-1]
+                # sample: [-1, 1]
                sample = 2 * distrib.sample([1])[0].cast('float32') / (
                    self.n_classes - 1.) - 1.
                output.append(sample)
@@ -418,9 +420,9 @@ class WaveRNN(nn.Layer):
            output = output[0]

        # Fade-out at the end to avoid signal cutting out suddenly
-        fade_out = paddle.linspace(1, 0, 20 * self.hop_length)
+        fade_out = paddle.linspace(1, 0, 10 * self.hop_length)
        output = output[:wave_len]
-        output[-20 * self.hop_length:] *= fade_out
+        output[-10 * self.hop_length:] *= fade_out

        self.train()