# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np

from paddle import fluid
import paddle.fluid.dygraph as dg

from hparams import hparams, hparams_debug_string
from parakeet import g2p as frontend
from deepvoice3 import DeepVoiceTTS


def dry_run(model):
    """
    Run the model once, just to get it initialized.
    """
    model.train()
    _frontend = getattr(frontend, hparams.frontend)

    # shapes of the fake batch used only for initialization
    batch_size = 4
    enc_length = 157
    snd_sample_length = 500

    r = hparams.outputs_per_step
    downsample_step = hparams.downsample_step
    n_speakers = hparams.n_speakers

    # make sure snd_sample_length is divisible by r * downsample_step
    linear_shift = r * downsample_step
    snd_sample_length += linear_shift - snd_sample_length % linear_shift
    decoder_length = snd_sample_length // downsample_step // r
    mel_length = snd_sample_length // downsample_step

    n_vocab = _frontend.n_vocab
    max_pos = hparams.max_positions
    spker_embed = hparams.speaker_embed_dim

    linear_dim = model.linear_dim
    mel_dim = hparams.num_mels

    # fabricate random inputs with the shapes and dtypes the model expects
    x = np.random.randint(
        low=0, high=n_vocab, size=(batch_size, enc_length, 1), dtype="int64")
    input_lengths = np.arange(
        enc_length - batch_size + 1, enc_length + 1, dtype="int64")
    mel = np.random.randn(batch_size, mel_dim, 1, mel_length).astype("float32")
    y = np.random.randn(batch_size, linear_dim, 1,
                        snd_sample_length).astype("float32")

    text_positions = np.tile(
        np.arange(
            0, enc_length, dtype="int64"), (batch_size, 1))
    text_mask = text_positions > np.expand_dims(input_lengths, 1)
    text_positions[text_mask] = 0
    text_positions = np.expand_dims(text_positions, axis=-1)

    frame_positions = np.tile(
        np.arange(
            1, decoder_length + 1, dtype="int64"), (batch_size, 1))
    frame_positions = np.expand_dims(frame_positions, axis=-1)

    done = np.zeros(shape=(batch_size, 1, 1, decoder_length), dtype="float32")
    target_lengths = np.array([snd_sample_length] * batch_size).astype("int64")

    speaker_ids = np.random.randint(
        low=0, high=n_speakers, size=(batch_size, 1),
        dtype="int64") if n_speakers > 1 else None
    ismultispeaker = speaker_ids is not None

    # move the numpy inputs into dygraph Variables
    x = dg.to_variable(x)
    input_lengths = dg.to_variable(input_lengths)
    mel = dg.to_variable(mel)
    y = dg.to_variable(y)
    text_positions = dg.to_variable(text_positions)
    frame_positions = dg.to_variable(frame_positions)
    done = dg.to_variable(done)
    target_lengths = dg.to_variable(target_lengths)
    speaker_ids = dg.to_variable(
        speaker_ids) if speaker_ids is not None else None

    # these two fields are used as numpy ndarrays
    text_lengths = input_lengths.numpy()
    decoder_lengths = target_lengths.numpy() // r // downsample_step

    max_seq_len = max(text_lengths.max(), decoder_lengths.max())
    if max_seq_len >= hparams.max_positions:
        raise RuntimeError(
            "max_seq_len ({}) >= max_positions ({})\n"
            "Input text or decoder target length exceeded the maximum length.\n"
            "Please set a larger value for ``max_positions`` in hyper parameters."
            .format(max_seq_len, hparams.max_positions))

    # paddle's embedding layer expects shape[-1] == 1, hence the trailing
    # dimension of 1 in the integer inputs built above

    # the first dry run runs the whole model once
    mel_outputs, linear_outputs, attn, done_hat = model(
        x, input_lengths, mel, speaker_ids, text_positions, frame_positions)

    # report every parameter tensor and the total number of elements
    num_parameters = 0
    for k, v in model.state_dict().items():
        print("{}|{}|{}".format(k, v.shape, np.prod(v.shape)))
        num_parameters += np.prod(v.shape)

    print("the model has {} parameter tensors".format(len(model.state_dict())))
    print("the model has {} elements in total".format(num_parameters))
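

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): `dry_run` expects a dygraph
# model, so it must be called inside a dygraph context. The `build_model()`
# helper below is hypothetical; in this example the DeepVoiceTTS instance is
# actually constructed elsewhere from `hparams`.
#
# if __name__ == "__main__":
#     with dg.guard(fluid.CPUPlace()):
#         model = build_model()  # hypothetical: returns a DeepVoiceTTS instance
#         dry_run(model)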