diff --git a/examples/csmsc/voc6/conf/default.yaml b/examples/csmsc/voc6/conf/default.yaml index 2c838fb998229f6faa08c6e19b2eb7e3a3edeb15..e7696cf40d5a5f791b5aa1b18efa7fc9512267f3 100644 --- a/examples/csmsc/voc6/conf/default.yaml +++ b/examples/csmsc/voc6/conf/default.yaml @@ -12,7 +12,6 @@ n_mels: 80 # Number of mel basis. fmin: 80 # Minimum freq in mel basis calculation. (Hz) fmax: 7600 # Maximum frequency in mel basis calculation. (Hz) mu_law: True # Recommended to suppress noise if using raw bitsexit() -peak_norm: True ########################################################### @@ -22,13 +21,14 @@ model: rnn_dims: 512 # Hidden dims of RNN Layers. fc_dims: 512 bits: 9 # Bit depth of signal - aux_context_window: 2 + aux_context_window: 2 # Context window size for auxiliary feature. + # If set to 2, previous 2 and future 2 frames will be considered. aux_channels: 80 # Number of channels for auxiliary feature conv. # Must be the same as num_mels. upsample_scales: [4, 5, 3, 5] # Upsampling scales. Prodcut of these must be the same as hop size, same with pwgan here - compute_dims: 128 - res_out_dims: 128 - res_blocks: 10 + compute_dims: 128 # Dims of Conv1D in MelResNet. + res_out_dims: 128 # Dims of output in MelResNet. + res_blocks: 10 # Number of residual blocks. mode: RAW # either 'raw'(softmax on raw bits) or 'mold' (sample from mixture of logistics) inference: gen_batched: True # whether to genenate sample in batch mode @@ -42,7 +42,6 @@ inference: batch_size: 64 # Batch size. batch_max_steps: 4500 # Length of each audio in batch. Make sure dividable by hop_size. num_workers: 2 # Number of workers in DataLoader. 
-valid_size: 50 ########################################################### # OPTIMIZER SETTING # diff --git a/paddlespeech/t2s/audio/__init__.py b/paddlespeech/t2s/audio/__init__.py index 7747b794536aab1aab1de18dc0b686b8814efdbb..0deefc8bc368fa5e4f6236569d539082fb4c37cb 100644 --- a/paddlespeech/t2s/audio/__init__.py +++ b/paddlespeech/t2s/audio/__init__.py @@ -12,5 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. from .audio import AudioProcessor +from .codec import * from .spec_normalizer import LogMagnitude from .spec_normalizer import NormalizerBase diff --git a/paddlespeech/t2s/audio/codec.py b/paddlespeech/t2s/audio/codec.py new file mode 100644 index 0000000000000000000000000000000000000000..2a759ce4ce231485204ac2b7b88f6d4943cc534a --- /dev/null +++ b/paddlespeech/t2s/audio/codec.py @@ -0,0 +1,51 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math + +import numpy as np +import paddle + + +# x: [0, 2**bits-1], return: [-1, 1] +def label_2_float(x, bits): + return 2 * x / (2**bits - 1.) - 1. + + +# x: [-1, 1], return: [0, 2**bits-1] +def float_2_label(x, bits): + assert abs(x).max() <= 1.0 + x = (x + 1.) 
* (2**bits - 1) / 2 + return x.clip(0, 2**bits - 1) + + +# x: [-1, 1], mu: 2**bits, return: [0, 2**bits-1] +# see https://en.wikipedia.org/wiki/%CE%9C-law_algorithm +# note: the input `mu` here is 1 greater than the `mu` used in the link above +def encode_mu_law(x, mu): + mu = mu - 1 + fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu) + return np.floor((fx + 1) / 2 * mu + 0.5) + + +# from_labels = True: +# y: [0, 2**bits-1], mu: 2**bits, return: [-1, 1] +# from_labels = False: +# y: [-1, 1], return: [-1, 1] +def decode_mu_law(y, mu, from_labels=True): + # TODO: get rid of log2 - makes no sense + if from_labels: + y = label_2_float(y, math.log2(mu)) + mu = mu - 1 + x = paddle.sign(y) / mu * ((1 + mu)**paddle.abs(y) - 1) + return x diff --git a/paddlespeech/t2s/datasets/vocoder_batch_fn.py b/paddlespeech/t2s/datasets/vocoder_batch_fn.py index b1d22db97bae2733a86b8c1554abde900414b5d2..d969a1d3553712690c588663a10b41f7a07d5fc7 100644 --- a/paddlespeech/t2s/datasets/vocoder_batch_fn.py +++ b/paddlespeech/t2s/datasets/vocoder_batch_fn.py @@ -11,35 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import math - import numpy as np import paddle - -def label_2_float(x, bits): - return 2 * x / (2**bits - 1.) - 1. - - -def float_2_label(x, bits): - assert abs(x).max() <= 1.0 - x = (x + 1.) 
* (2**bits - 1) / 2 - return x.clip(0, 2**bits - 1) - - -def encode_mu_law(x, mu): - mu = mu - 1 - fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu) - return np.floor((fx + 1) / 2 * mu + 0.5) - - -def decode_mu_law(y, mu, from_labels=True): - # TODO: get rid of log2 - makes no sense - if from_labels: - y = label_2_float(y, math.log2(mu)) - mu = mu - 1 - x = paddle.sign(y) / mu * ((1 + mu)**paddle.abs(y) - 1) - return x +from paddlespeech.t2s.audio.codec import encode_mu_law +from paddlespeech.t2s.audio.codec import float_2_label +from paddlespeech.t2s.audio.codec import label_2_float class Clip(object): @@ -195,10 +172,12 @@ class WaveRNNClip(Clip): Returns ---------- Tensor - Auxiliary feature batch (B, C, T'), where - T = (T' - 2 * aux_context_window) * hop_size. + Input signal batch (B, 1, T). Tensor Target signal batch (B, 1, T). + Tensor + Auxiliary feature batch (B, C, T'), where + T = (T' - 2 * aux_context_window) * hop_size. """ # check length diff --git a/paddlespeech/t2s/models/wavernn/wavernn.py b/paddlespeech/t2s/models/wavernn/wavernn.py index f30879ed6c72dfde9ee4874c87117c740f8ed364..fcf39a482825133ca1669da5882700f36ddb5cf7 100644 --- a/paddlespeech/t2s/models/wavernn/wavernn.py +++ b/paddlespeech/t2s/models/wavernn/wavernn.py @@ -20,7 +20,7 @@ import paddle from paddle import nn from paddle.nn import functional as F -from paddlespeech.t2s.datasets.vocoder_batch_fn import decode_mu_law +from paddlespeech.t2s.audio.codec import decode_mu_law from paddlespeech.t2s.modules.losses import sample_from_discretized_mix_logistic from paddlespeech.t2s.modules.nets_utils import initialize from paddlespeech.t2s.modules.upsample import Stretch2D @@ -28,7 +28,7 @@ from paddlespeech.t2s.modules.upsample import Stretch2D class ResBlock(nn.Layer): def __init__(self, dims): - super(ResBlock, self).__init__() + super().__init__() self.conv1 = nn.Conv1D(dims, dims, kernel_size=1, bias_attr=False) self.conv2 = nn.Conv1D(dims, dims, kernel_size=1, 
bias_attr=False) self.batch_norm1 = nn.BatchNorm1D(dims) @@ -205,7 +205,7 @@ class WaveRNN(nn.Layer): if self.mode == 'RAW': self.n_classes = 2**bits elif self.mode == 'MOL': - self.n_classes = 30 + self.n_classes = 10 * 3 else: RuntimeError('Unknown model mode value - ', self.mode) @@ -333,7 +333,7 @@ class WaveRNN(nn.Layer): # (T, C_aux) -> (1, C_aux, T) c = paddle.transpose(c, [1, 0]).unsqueeze(0) T = paddle.shape(c)[-1] - wave_len = (T - 1) * self.hop_length + wave_len = T * self.hop_length # TODO remove two transpose op by modifying function pad_tensor c = self.pad_tensor( c.transpose([0, 2, 1]), pad=self.aux_context_window, @@ -396,6 +396,8 @@ class WaveRNN(nn.Layer): posterior = F.softmax(logits, axis=1) distrib = paddle.distribution.Categorical(posterior) # corresponding operate [np.floor((fx + 1) / 2 * mu + 0.5)] in enocde_mu_law + # distrib.sample([1])[0].cast('float32'): [0, 2**bits-1] + # sample: [-1, 1] sample = 2 * distrib.sample([1])[0].cast('float32') / ( self.n_classes - 1.) - 1. output.append(sample) @@ -418,9 +420,9 @@ class WaveRNN(nn.Layer): output = output[0] # Fade-out at the end to avoid signal cutting out suddenly - fade_out = paddle.linspace(1, 0, 20 * self.hop_length) + fade_out = paddle.linspace(1, 0, 10 * self.hop_length) output = output[:wave_len] - output[-20 * self.hop_length:] *= fade_out + output[-10 * self.hop_length:] *= fade_out self.train()