提交 9699c007 编写于 作者: 小湉湉's avatar 小湉湉

change the docstring style from numpydoc to google, test=tts

上级 683679be
...@@ -22,25 +22,16 @@ from paddle.io import Dataset ...@@ -22,25 +22,16 @@ from paddle.io import Dataset
class DataTable(Dataset): class DataTable(Dataset):
"""Dataset to load and convert data for general purpose. """Dataset to load and convert data for general purpose.
Args:
Parameters data (List[Dict[str, Any]]): Metadata, a list of meta datum, each of which is composed of several fields
---------- fields (List[str], optional): Fields to use, if not specified, all the fields in the data are used, by default None
data : List[Dict[str, Any]] converters (Dict[str, Callable], optional): Converters used to process each field, by default None
Metadata, a list of meta datum, each of which is composed of use_cache (bool, optional): Whether to use cache, by default False
several fields
fields : List[str], optional Raises:
Fields to use, if not specified, all the fields in the data are ValueError:
used, by default None
converters : Dict[str, Callable], optional
Converters used to process each field, by default None
use_cache : bool, optional
Whether to use cache, by default False
Raises
------
ValueError
If there is some field that does not exist in data. If there is some field that does not exist in data.
ValueError ValueError:
If there is some field in converters that does not exist in fields. If there is some field in converters that does not exist in fields.
""" """
...@@ -95,15 +86,11 @@ class DataTable(Dataset): ...@@ -95,15 +86,11 @@ class DataTable(Dataset):
"""Convert a meta datum to an example by applying the corresponding """Convert a meta datum to an example by applying the corresponding
converters to each field requested. converters to each field requested.
Parameters Args:
---------- meta_datum (Dict[str, Any]): Meta datum
meta_datum : Dict[str, Any]
Meta datum
Returns Returns:
------- Dict[str, Any]: Converted example
Dict[str, Any]
Converted example
""" """
example = {} example = {}
for field in self.fields: for field in self.fields:
...@@ -118,16 +105,11 @@ class DataTable(Dataset): ...@@ -118,16 +105,11 @@ class DataTable(Dataset):
def __getitem__(self, idx: int) -> Dict[str, Any]: def __getitem__(self, idx: int) -> Dict[str, Any]:
"""Get an example given an index. """Get an example given an index.
Args:
idx (int): Index of the example to get
Parameters Returns:
---------- Dict[str, Any]: A converted example
idx : int
Index of the example to get
Returns
-------
Dict[str, Any]
A converted example
""" """
if self.use_cache and self.caches[idx] is not None: if self.use_cache and self.caches[idx] is not None:
return self.caches[idx] return self.caches[idx]
......
...@@ -18,14 +18,10 @@ import re ...@@ -18,14 +18,10 @@ import re
def get_phn_dur(file_name): def get_phn_dur(file_name):
''' '''
read MFA duration.txt read MFA duration.txt
Parameters Args:
---------- file_name (str or Path): path of gen_duration_from_textgrid.py's result
file_name : str or Path Returns:
path of gen_duration_from_textgrid.py's result Dict: sentence: {'utt': ([char], [int])}
Returns
----------
Dict
sentence: {'utt': ([char], [int])}
''' '''
f = open(file_name, 'r') f = open(file_name, 'r')
sentence = {} sentence = {}
...@@ -48,10 +44,8 @@ def get_phn_dur(file_name): ...@@ -48,10 +44,8 @@ def get_phn_dur(file_name):
def merge_silence(sentence): def merge_silence(sentence):
''' '''
merge silences merge silences
Parameters Args:
---------- sentence (Dict): sentence: {'utt': (([char], [int]), str)}
sentence : Dict
sentence: {'utt': (([char], [int]), str)}
''' '''
for utt in sentence: for utt in sentence:
cur_phn, cur_dur, speaker = sentence[utt] cur_phn, cur_dur, speaker = sentence[utt]
...@@ -81,12 +75,9 @@ def merge_silence(sentence): ...@@ -81,12 +75,9 @@ def merge_silence(sentence):
def get_input_token(sentence, output_path, dataset="baker"): def get_input_token(sentence, output_path, dataset="baker"):
''' '''
get phone set from training data and save it get phone set from training data and save it
Parameters Args:
---------- sentence (Dict): sentence: {'utt': ([char], [int])}
sentence : Dict output_path (str or path): path to save phone_id_map
sentence: {'utt': ([char], [int])}
output_path : str or path
path to save phone_id_map
''' '''
phn_token = set() phn_token = set()
for utt in sentence: for utt in sentence:
...@@ -112,14 +103,10 @@ def get_phones_tones(sentence, ...@@ -112,14 +103,10 @@ def get_phones_tones(sentence,
dataset="baker"): dataset="baker"):
''' '''
get phone set and tone set from training data and save it get phone set and tone set from training data and save it
Parameters Args:
---------- sentence (Dict): sentence: {'utt': ([char], [int])}
sentence : Dict phones_output_path (str or path): path to save phone_id_map
sentence: {'utt': ([char], [int])} tones_output_path (str or path): path to save tone_id_map
phones_output_path : str or path
path to save phone_id_map
tones_output_path : str or path
path to save tone_id_map
''' '''
phn_token = set() phn_token = set()
tone_token = set() tone_token = set()
...@@ -162,14 +149,10 @@ def get_spk_id_map(speaker_set, output_path): ...@@ -162,14 +149,10 @@ def get_spk_id_map(speaker_set, output_path):
def compare_duration_and_mel_length(sentences, utt, mel): def compare_duration_and_mel_length(sentences, utt, mel):
''' '''
check duration error, correct sentences[utt] if possible, else pop sentences[utt] check duration error, correct sentences[utt] if possible, else pop sentences[utt]
Parameters Args:
---------- sentences (Dict): sentences[utt] = [phones_list ,durations_list]
sentences : Dict utt (str): utt_id
sentences[utt] = [phones_list ,durations_list] mel (np.ndarray): features (num_frames, n_mels)
utt : str
utt_id
mel : np.ndarray
features (num_frames, n_mels)
''' '''
if utt in sentences: if utt in sentences:
......
...@@ -29,15 +29,11 @@ class Clip(object): ...@@ -29,15 +29,11 @@ class Clip(object):
hop_size=256, hop_size=256,
aux_context_window=0, ): aux_context_window=0, ):
"""Initialize customized collater for DataLoader. """Initialize customized collater for DataLoader.
Args:
Parameters batch_max_steps (int): The maximum length of input signal in batch.
---------- hop_size (int): Hop size of auxiliary features.
batch_max_steps : int aux_context_window (int): Context window size for auxiliary feature conv.
The maximum length of input signal in batch.
hop_size : int
Hop size of auxiliary features.
aux_context_window : int
Context window size for auxiliary feature conv.
""" """
if batch_max_steps % hop_size != 0: if batch_max_steps % hop_size != 0:
...@@ -56,17 +52,14 @@ class Clip(object): ...@@ -56,17 +52,14 @@ class Clip(object):
def __call__(self, batch): def __call__(self, batch):
"""Convert into batch tensors. """Convert into batch tensors.
Parameters Args:
---------- batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
batch : list
list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
Returns Returns:
---------- Tensor:
Tensor
Auxiliary feature batch (B, C, T'), where Auxiliary feature batch (B, C, T'), where
T = (T' - 2 * aux_context_window) * hop_size. T = (T' - 2 * aux_context_window) * hop_size.
Tensor Tensor:
Target signal batch (B, 1, T). Target signal batch (B, 1, T).
""" """
...@@ -104,8 +97,7 @@ class Clip(object): ...@@ -104,8 +97,7 @@ class Clip(object):
def _adjust_length(self, x, c): def _adjust_length(self, x, c):
"""Adjust the audio and feature lengths. """Adjust the audio and feature lengths.
Note Note:
-------
Basically we assume that the length of x and c are adjusted Basically we assume that the length of x and c are adjusted
through preprocessing stage, but if we use other library processed through preprocessing stage, but if we use other library processed
features, this process will be needed. features, this process will be needed.
...@@ -162,22 +154,14 @@ class WaveRNNClip(Clip): ...@@ -162,22 +154,14 @@ class WaveRNNClip(Clip):
# voc_pad = 2 this will pad the input so that the resnet can 'see' wider than input length # voc_pad = 2 this will pad the input so that the resnet can 'see' wider than input length
# max_offsets = n_frames - 2 - (mel_win + 2 * hp.voc_pad) = n_frames - 15 # max_offsets = n_frames - 2 - (mel_win + 2 * hp.voc_pad) = n_frames - 15
"""Convert into batch tensors. """Convert into batch tensors.
Args:
Parameters batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
----------
batch : list Returns:
list of tuple of the pair of audio and features. Tensor: Input signal batch (B, 1, T).
Audio shape (T, ), features shape(T', C). Tensor: Target signal batch (B, 1, T).
Tensor: Auxiliary feature batch (B, C, T'),
Returns where T = (T' - 2 * aux_context_window) * hop_size.
----------
Tensor
Input signal batch (B, 1, T).
Tensor
Target signal batch (B, 1, T).
Tensor
Auxiliary feature batch (B, C, T'), where
T = (T' - 2 * aux_context_window) * hop_size.
""" """
# check length # check length
......
...@@ -31,15 +31,12 @@ from paddlespeech.t2s.frontend import English ...@@ -31,15 +31,12 @@ from paddlespeech.t2s.frontend import English
def get_lj_sentences(file_name, frontend): def get_lj_sentences(file_name, frontend):
''' '''read MFA duration.txt
read MFA duration.txt
Parameters Args:
---------- file_name (str or Path)
file_name : str or Path Returns:
Returns Dict: sentence: {'utt': ([char], [int])}
----------
Dict
sentence: {'utt': ([char], [int])}
''' '''
f = open(file_name, 'r') f = open(file_name, 'r')
sentence = {} sentence = {}
...@@ -59,14 +56,11 @@ def get_lj_sentences(file_name, frontend): ...@@ -59,14 +56,11 @@ def get_lj_sentences(file_name, frontend):
def get_input_token(sentence, output_path): def get_input_token(sentence, output_path):
''' '''get phone set from training data and save it
get phone set from training data and save it
Parameters Args:
---------- sentence (Dict): sentence: {'utt': ([char], str)}
sentence : Dict output_path (str or path): path to save phone_id_map
sentence: {'utt': ([char], str)}
output_path : str or path
path to save phone_id_map
''' '''
phn_token = set() phn_token = set()
for utt in sentence: for utt in sentence:
......
...@@ -133,16 +133,11 @@ class ARPABET(Phonetics): ...@@ -133,16 +133,11 @@ class ARPABET(Phonetics):
def phoneticize(self, sentence, add_start_end=False): def phoneticize(self, sentence, add_start_end=False):
""" Normalize the input text sequence and convert it into pronunciation sequence. """ Normalize the input text sequence and convert it into pronunciation sequence.
Args:
sentence (str): The input text sequence.
Parameters Returns:
----------- List[str]: The list of pronunciation sequence.
sentence: str
The input text sequence.
Returns
----------
List[str]
The list of pronunciation sequence.
""" """
phonemes = [ phonemes = [
self._remove_vowels(item) for item in self.backend(sentence) self._remove_vowels(item) for item in self.backend(sentence)
...@@ -157,15 +152,11 @@ class ARPABET(Phonetics): ...@@ -157,15 +152,11 @@ class ARPABET(Phonetics):
def numericalize(self, phonemes): def numericalize(self, phonemes):
""" Convert pronunciation sequence into pronunciation id sequence. """ Convert pronunciation sequence into pronunciation id sequence.
Parameters Args:
----------- phonemes (List[str]): The list of pronunciation sequence.
phonemes: List[str]
The list of pronunciation sequence.
Returns Returns:
---------- List[int]: The list of pronunciation id sequence.
List[int]
The list of pronunciation id sequence.
""" """
ids = [self.vocab.lookup(item) for item in phonemes] ids = [self.vocab.lookup(item) for item in phonemes]
return ids return ids
...@@ -173,14 +164,11 @@ class ARPABET(Phonetics): ...@@ -173,14 +164,11 @@ class ARPABET(Phonetics):
def reverse(self, ids): def reverse(self, ids):
""" Reverse the list of pronunciation id sequence to a list of pronunciation sequence. """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
Parameters Args:
----------- ids (List[int]): The list of pronunciation id sequence.
ids: List[int]
The list of pronunciation id sequence.
Returns Returns:
---------- List[str]:
List[str]
The list of pronunciation sequence. The list of pronunciation sequence.
""" """
return [self.vocab.reverse(i) for i in ids] return [self.vocab.reverse(i) for i in ids]
...@@ -188,15 +176,11 @@ class ARPABET(Phonetics): ...@@ -188,15 +176,11 @@ class ARPABET(Phonetics):
def __call__(self, sentence, add_start_end=False): def __call__(self, sentence, add_start_end=False):
""" Convert the input text sequence into pronunciation id sequence. """ Convert the input text sequence into pronunciation id sequence.
Parameters Args:
----------- sentence (str): The input text sequence.
sentence: str
The input text sequence.
Returns Returns:
---------- List[int]: The list of pronunciation id sequence.
List[int]
The list of pronunciation id sequence.
""" """
return self.numericalize( return self.numericalize(
self.phoneticize(sentence, add_start_end=add_start_end)) self.phoneticize(sentence, add_start_end=add_start_end))
...@@ -229,15 +213,11 @@ class ARPABETWithStress(Phonetics): ...@@ -229,15 +213,11 @@ class ARPABETWithStress(Phonetics):
def phoneticize(self, sentence, add_start_end=False): def phoneticize(self, sentence, add_start_end=False):
""" Normalize the input text sequence and convert it into pronunciation sequence. """ Normalize the input text sequence and convert it into pronunciation sequence.
Parameters Args:
----------- sentence (str): The input text sequence.
sentence: str
The input text sequence.
Returns Returns:
---------- List[str]: The list of pronunciation sequence.
List[str]
The list of pronunciation sequence.
""" """
phonemes = self.backend(sentence) phonemes = self.backend(sentence)
if add_start_end: if add_start_end:
...@@ -250,46 +230,32 @@ class ARPABETWithStress(Phonetics): ...@@ -250,46 +230,32 @@ class ARPABETWithStress(Phonetics):
def numericalize(self, phonemes): def numericalize(self, phonemes):
""" Convert pronunciation sequence into pronunciation id sequence. """ Convert pronunciation sequence into pronunciation id sequence.
Parameters Args:
----------- phonemes (List[str]): The list of pronunciation sequence.
phonemes: List[str]
The list of pronunciation sequence.
Returns Returns:
---------- List[int]: The list of pronunciation id sequence.
List[int]
The list of pronunciation id sequence.
""" """
ids = [self.vocab.lookup(item) for item in phonemes] ids = [self.vocab.lookup(item) for item in phonemes]
return ids return ids
def reverse(self, ids): def reverse(self, ids):
""" Reverse the list of pronunciation id sequence to a list of pronunciation sequence. """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
Args:
ids (List[int]): The list of pronunciation id sequence.
Parameters Returns:
----------- List[str]: The list of pronunciation sequence.
ids: List[int]
The list of pronunciation id sequence.
Returns
----------
List[str]
The list of pronunciation sequence.
""" """
return [self.vocab.reverse(i) for i in ids] return [self.vocab.reverse(i) for i in ids]
def __call__(self, sentence, add_start_end=False): def __call__(self, sentence, add_start_end=False):
""" Convert the input text sequence into pronunciation id sequence. """ Convert the input text sequence into pronunciation id sequence.
Args:
sentence (str): The input text sequence.
Parameters Returns:
----------- List[int]: The list of pronunciation id sequence.
sentence: str
The input text sequence.
Returns
----------
List[int]
The list of pronunciation id sequence.
""" """
return self.numericalize( return self.numericalize(
self.phoneticize(sentence, add_start_end=add_start_end)) self.phoneticize(sentence, add_start_end=add_start_end))
......
...@@ -65,14 +65,10 @@ class English(Phonetics): ...@@ -65,14 +65,10 @@ class English(Phonetics):
def phoneticize(self, sentence): def phoneticize(self, sentence):
""" Normalize the input text sequence and convert it into pronunciation sequence. """ Normalize the input text sequence and convert it into pronunciation sequence.
Parameters Args:
----------- sentence (str): The input text sequence.
sentence: str Returns:
The input text sequence. List[str]: The list of pronunciation sequence.
Returns
----------
List[str]
The list of pronunciation sequence.
""" """
start = self.vocab.start_symbol start = self.vocab.start_symbol
end = self.vocab.end_symbol end = self.vocab.end_symbol
...@@ -123,14 +119,10 @@ class English(Phonetics): ...@@ -123,14 +119,10 @@ class English(Phonetics):
def numericalize(self, phonemes): def numericalize(self, phonemes):
""" Convert pronunciation sequence into pronunciation id sequence. """ Convert pronunciation sequence into pronunciation id sequence.
Parameters Args:
----------- phonemes (List[str]): The list of pronunciation sequence.
phonemes: List[str] Returns:
The list of pronunciation sequence. List[int]: The list of pronunciation id sequence.
Returns
----------
List[int]
The list of pronunciation id sequence.
""" """
ids = [ ids = [
self.vocab.lookup(item) for item in phonemes self.vocab.lookup(item) for item in phonemes
...@@ -140,27 +132,19 @@ class English(Phonetics): ...@@ -140,27 +132,19 @@ class English(Phonetics):
def reverse(self, ids): def reverse(self, ids):
""" Reverse the list of pronunciation id sequence to a list of pronunciation sequence. """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
Parameters Args:
----------- ids (List[int]): The list of pronunciation id sequence.
ids: List[int] Returns:
The list of pronunciation id sequence. List[str]: The list of pronunciation sequence.
Returns
----------
List[str]
The list of pronunciation sequence.
""" """
return [self.vocab.reverse(i) for i in ids] return [self.vocab.reverse(i) for i in ids]
def __call__(self, sentence): def __call__(self, sentence):
""" Convert the input text sequence into pronunciation id sequence. """ Convert the input text sequence into pronunciation id sequence.
Parameters Args:
----------- sentence (str): The input text sequence.
sentence: str Returns:
The input text sequence. List[int]: The list of pronunciation id sequence.
Returns
----------
List[int]
The list of pronunciation id sequence.
""" """
return self.numericalize(self.phoneticize(sentence)) return self.numericalize(self.phoneticize(sentence))
...@@ -183,27 +167,20 @@ class EnglishCharacter(Phonetics): ...@@ -183,27 +167,20 @@ class EnglishCharacter(Phonetics):
def phoneticize(self, sentence): def phoneticize(self, sentence):
""" Normalize the input text sequence. """ Normalize the input text sequence.
Parameters Args:
----------- sentence (str): The input text sequence.
sentence: str Returns:
The input text sequence. str: A text sequence after normalize.
Returns
----------
str
A text sequence after normalize.
""" """
words = normalize(sentence) words = normalize(sentence)
return words return words
def numericalize(self, sentence): def numericalize(self, sentence):
""" Convert a text sequence into ids. """ Convert a text sequence into ids.
Parameters Args:
----------- sentence (str): The input text sequence.
sentence: str Returns:
The input text sequence. List[int]:
Returns
----------
List[int]
List of a character id sequence. List of a character id sequence.
""" """
ids = [ ids = [
...@@ -214,27 +191,19 @@ class EnglishCharacter(Phonetics): ...@@ -214,27 +191,19 @@ class EnglishCharacter(Phonetics):
def reverse(self, ids): def reverse(self, ids):
""" Convert a character id sequence into text. """ Convert a character id sequence into text.
Parameters Args:
----------- ids (List[int]): List of a character id sequence.
ids: List[int] Returns:
List of a character id sequence. str: The input text sequence.
Returns
----------
str
The input text sequence.
""" """
return [self.vocab.reverse(i) for i in ids] return [self.vocab.reverse(i) for i in ids]
def __call__(self, sentence): def __call__(self, sentence):
""" Normalize the input text sequence and convert it into character id sequence. """ Normalize the input text sequence and convert it into character id sequence.
Parameters Args:
----------- sentence (str): The input text sequence.
sentence: str Returns:
The input text sequence. List[int]: List of a character id sequence.
Returns
----------
List[int]
List of a character id sequence.
""" """
return self.numericalize(self.phoneticize(sentence)) return self.numericalize(self.phoneticize(sentence))
...@@ -264,14 +233,10 @@ class Chinese(Phonetics): ...@@ -264,14 +233,10 @@ class Chinese(Phonetics):
def phoneticize(self, sentence): def phoneticize(self, sentence):
""" Normalize the input text sequence and convert it into pronunciation sequence. """ Normalize the input text sequence and convert it into pronunciation sequence.
Parameters Args:
----------- sentence (str): The input text sequence.
sentence: str Returns:
The input text sequence. List[str]: The list of pronunciation sequence.
Returns
----------
List[str]
The list of pronunciation sequence.
""" """
# simplified = self.opencc_backend.convert(sentence) # simplified = self.opencc_backend.convert(sentence)
simplified = sentence simplified = sentence
...@@ -296,28 +261,20 @@ class Chinese(Phonetics): ...@@ -296,28 +261,20 @@ class Chinese(Phonetics):
def numericalize(self, phonemes): def numericalize(self, phonemes):
""" Convert pronunciation sequence into pronunciation id sequence. """ Convert pronunciation sequence into pronunciation id sequence.
Parameters Args:
----------- phonemes (List[str]): The list of pronunciation sequence.
phonemes: List[str] Returns:
The list of pronunciation sequence. List[int]: The list of pronunciation id sequence.
Returns
----------
List[int]
The list of pronunciation id sequence.
""" """
ids = [self.vocab.lookup(item) for item in phonemes] ids = [self.vocab.lookup(item) for item in phonemes]
return ids return ids
def __call__(self, sentence): def __call__(self, sentence):
""" Convert the input text sequence into pronunciation id sequence. """ Convert the input text sequence into pronunciation id sequence.
Parameters Args:
----------- sentence (str): The input text sequence.
sentence: str Returns:
The input text sequence. List[int]: The list of pronunciation id sequence.
Returns
----------
List[int]
The list of pronunciation id sequence.
""" """
return self.numericalize(self.phoneticize(sentence)) return self.numericalize(self.phoneticize(sentence))
...@@ -329,13 +286,9 @@ class Chinese(Phonetics): ...@@ -329,13 +286,9 @@ class Chinese(Phonetics):
def reverse(self, ids): def reverse(self, ids):
""" Reverse the list of pronunciation id sequence to a list of pronunciation sequence. """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
Parameters Args:
----------- ids (List[int]): The list of pronunciation id sequence.
ids: List[int] Returns:
The list of pronunciation id sequence. List[str]: The list of pronunciation sequence.
Returns
----------
List[str]
The list of pronunciation sequence.
""" """
return [self.vocab.reverse(i) for i in ids] return [self.vocab.reverse(i) for i in ids]
...@@ -20,22 +20,12 @@ __all__ = ["Vocab"] ...@@ -20,22 +20,12 @@ __all__ = ["Vocab"]
class Vocab(object): class Vocab(object):
""" Vocabulary. """ Vocabulary.
Parameters Args:
----------- symbols (Iterable[str]): Common symbols.
symbols: Iterable[str] padding_symbol (str, optional): Symbol for pad. Defaults to "<pad>".
Common symbols. unk_symbol (str, optional): Symbol for unknown. Defaults to "<unk>"
start_symbol (str, optional): Symbol for start. Defaults to "<s>"
padding_symbol: str, optional end_symbol (str, optional): Symbol for end. Defaults to "</s>"
Symbol for pad. Defaults to "<pad>".
unk_symbol: str, optional
Symbol for unknown. Defaults to "<unk>"
start_symbol: str, optional
Symbol for start. Defaults to "<s>"
end_symbol: str, optional
Symbol for end. Defaults to "</s>"
""" """
def __init__(self, def __init__(self,
......
...@@ -44,11 +44,9 @@ RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])' ...@@ -44,11 +44,9 @@ RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])'
def replace_time(match) -> str: def replace_time(match) -> str:
""" """
Parameters Args:
---------- match (re.Match)
match : re.Match Returns:
Returns
----------
str str
""" """
...@@ -87,11 +85,9 @@ RE_DATE = re.compile(r'(\d{4}|\d{2})年' ...@@ -87,11 +85,9 @@ RE_DATE = re.compile(r'(\d{4}|\d{2})年'
def replace_date(match) -> str: def replace_date(match) -> str:
""" """
Parameters Args:
---------- match (re.Match)
match : re.Match Returns:
Returns
----------
str str
""" """
year = match.group(1) year = match.group(1)
...@@ -114,11 +110,9 @@ RE_DATE2 = re.compile( ...@@ -114,11 +110,9 @@ RE_DATE2 = re.compile(
def replace_date2(match) -> str: def replace_date2(match) -> str:
""" """
Parameters Args:
---------- match (re.Match)
match : re.Match Returns:
Returns
----------
str str
""" """
year = match.group(1) year = match.group(1)
......
...@@ -36,11 +36,9 @@ RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)') ...@@ -36,11 +36,9 @@ RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')
def replace_frac(match) -> str: def replace_frac(match) -> str:
""" """
Parameters Args:
---------- match (re.Match)
match : re.Match Returns:
Returns
----------
str str
""" """
sign = match.group(1) sign = match.group(1)
...@@ -59,11 +57,9 @@ RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%') ...@@ -59,11 +57,9 @@ RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%')
def replace_percentage(match) -> str: def replace_percentage(match) -> str:
""" """
Parameters Args:
---------- match (re.Match)
match : re.Match Returns:
Returns
----------
str str
""" """
sign = match.group(1) sign = match.group(1)
...@@ -81,11 +77,9 @@ RE_INTEGER = re.compile(r'(-)' r'(\d+)') ...@@ -81,11 +77,9 @@ RE_INTEGER = re.compile(r'(-)' r'(\d+)')
def replace_negative_num(match) -> str: def replace_negative_num(match) -> str:
""" """
Parameters Args:
---------- match (re.Match)
match : re.Match Returns:
Returns
----------
str str
""" """
sign = match.group(1) sign = match.group(1)
...@@ -103,11 +97,9 @@ RE_DEFAULT_NUM = re.compile(r'\d{3}\d*') ...@@ -103,11 +97,9 @@ RE_DEFAULT_NUM = re.compile(r'\d{3}\d*')
def replace_default_num(match): def replace_default_num(match):
""" """
Parameters Args:
---------- match (re.Match)
match : re.Match Returns:
Returns
----------
str str
""" """
number = match.group(0) number = match.group(0)
...@@ -124,11 +116,9 @@ RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))') ...@@ -124,11 +116,9 @@ RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))')
def replace_positive_quantifier(match) -> str: def replace_positive_quantifier(match) -> str:
""" """
Parameters Args:
---------- match (re.Match)
match : re.Match Returns:
Returns
----------
str str
""" """
number = match.group(1) number = match.group(1)
...@@ -142,11 +132,9 @@ def replace_positive_quantifier(match) -> str: ...@@ -142,11 +132,9 @@ def replace_positive_quantifier(match) -> str:
def replace_number(match) -> str: def replace_number(match) -> str:
""" """
Parameters Args:
---------- match (re.Match)
match : re.Match Returns:
Returns
----------
str str
""" """
sign = match.group(1) sign = match.group(1)
...@@ -169,11 +157,9 @@ RE_RANGE = re.compile( ...@@ -169,11 +157,9 @@ RE_RANGE = re.compile(
def replace_range(match) -> str: def replace_range(match) -> str:
""" """
Parameters Args:
---------- match (re.Match)
match : re.Match Returns:
Returns
----------
str str
""" """
first, second = match.group(1), match.group(8) first, second = match.group(1), match.group(8)
......
...@@ -45,11 +45,9 @@ def phone2str(phone_string: str, mobile=True) -> str: ...@@ -45,11 +45,9 @@ def phone2str(phone_string: str, mobile=True) -> str:
def replace_phone(match) -> str: def replace_phone(match) -> str:
""" """
Parameters Args:
---------- match (re.Match)
match : re.Match Returns:
Returns
----------
str str
""" """
return phone2str(match.group(0), mobile=False) return phone2str(match.group(0), mobile=False)
...@@ -57,11 +55,9 @@ def replace_phone(match) -> str: ...@@ -57,11 +55,9 @@ def replace_phone(match) -> str:
def replace_mobile(match) -> str: def replace_mobile(match) -> str:
""" """
Parameters Args:
---------- match (re.Match)
match : re.Match Returns:
Returns
----------
str str
""" """
return phone2str(match.group(0)) return phone2str(match.group(0))
...@@ -22,11 +22,9 @@ RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)') ...@@ -22,11 +22,9 @@ RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)')
def replace_temperature(match) -> str: def replace_temperature(match) -> str:
""" """
Parameters Args:
---------- match (re.Match)
match : re.Match Returns:
Returns
----------
str str
""" """
sign = match.group(1) sign = match.group(1)
......
...@@ -55,14 +55,10 @@ class TextNormalizer(): ...@@ -55,14 +55,10 @@ class TextNormalizer():
def _split(self, text: str, lang="zh") -> List[str]: def _split(self, text: str, lang="zh") -> List[str]:
"""Split long text into sentences with sentence-splitting punctuations. """Split long text into sentences with sentence-splitting punctuations.
Parameters Args:
---------- text (str): The input text.
text : str Returns:
The input text. List[str]: Sentences.
Returns
-------
List[str]
Sentences.
""" """
# Only for pure Chinese here # Only for pure Chinese here
if lang == "zh": if lang == "zh":
......
...@@ -37,34 +37,20 @@ class HiFiGANGenerator(nn.Layer): ...@@ -37,34 +37,20 @@ class HiFiGANGenerator(nn.Layer):
use_weight_norm: bool=True, use_weight_norm: bool=True,
init_type: str="xavier_uniform", ): init_type: str="xavier_uniform", ):
"""Initialize HiFiGANGenerator module. """Initialize HiFiGANGenerator module.
Parameters Args:
---------- in_channels (int): Number of input channels.
in_channels : int out_channels (int): Number of output channels.
Number of input channels. channels (int): Number of hidden representation channels.
out_channels : int kernel_size (int): Kernel size of initial and final conv layer.
Number of output channels. upsample_scales (list): List of upsampling scales.
channels : int upsample_kernel_sizes (list): List of kernel sizes for upsampling layers.
Number of hidden representation channels. resblock_kernel_sizes (list): List of kernel sizes for residual blocks.
kernel_size : int resblock_dilations (list): List of dilation list for residual blocks.
Kernel size of initial and final conv layer. use_additional_convs (bool): Whether to use additional conv layers in residual blocks.
upsample_scales : list bias (bool): Whether to add bias parameter in convolution layers.
List of upsampling scales. nonlinear_activation (str): Activation function module name.
upsample_kernel_sizes : list nonlinear_activation_params (dict): Hyperparameters for activation function.
List of kernel sizes for upsampling layers. use_weight_norm (bool): Whether to use weight norm.
resblock_kernel_sizes : list
List of kernel sizes for residual blocks.
resblock_dilations : list
List of dilation list for residual blocks.
use_additional_convs : bool
Whether to use additional conv layers in residual blocks.
bias : bool
Whether to add bias parameter in convolution layers.
nonlinear_activation : str
Activation function module name.
nonlinear_activation_params : dict
Hyperparameters for activation function.
use_weight_norm : bool
Whether to use weight norm.
If set to true, it will be applied to all of the conv layers. If set to true, it will be applied to all of the conv layers.
""" """
super().__init__() super().__init__()
...@@ -134,14 +120,11 @@ class HiFiGANGenerator(nn.Layer): ...@@ -134,14 +120,11 @@ class HiFiGANGenerator(nn.Layer):
def forward(self, c): def forward(self, c):
"""Calculate forward propagation. """Calculate forward propagation.
Parameters
---------- Args:
c : Tensor c (Tensor): Input tensor (B, in_channels, T).
Input tensor (B, in_channels, T). Returns:
Returns Tensor: Output tensor (B, out_channels, T).
----------
Tensor
Output tensor (B, out_channels, T).
""" """
c = self.input_conv(c) c = self.input_conv(c)
for i in range(self.num_upsamples): for i in range(self.num_upsamples):
...@@ -196,14 +179,11 @@ class HiFiGANGenerator(nn.Layer): ...@@ -196,14 +179,11 @@ class HiFiGANGenerator(nn.Layer):
def inference(self, c): def inference(self, c):
"""Perform inference. """Perform inference.
Parameters Args:
---------- c (Tensor): Input tensor (T, in_channels).
c : Tensor
Input tensor (T, in_channels).
normalize_before (bool): Whether to perform normalization. normalize_before (bool): Whether to perform normalization.
Returns Returns:
---------- Tensor:
Tensor
Output tensor (T * prod(upsample_scales), out_channels). Output tensor (T * prod(upsample_scales), out_channels).
""" """
c = self.forward(c.transpose([1, 0]).unsqueeze(0)) c = self.forward(c.transpose([1, 0]).unsqueeze(0))
...@@ -229,35 +209,22 @@ class HiFiGANPeriodDiscriminator(nn.Layer): ...@@ -229,35 +209,22 @@ class HiFiGANPeriodDiscriminator(nn.Layer):
use_spectral_norm: bool=False, use_spectral_norm: bool=False,
init_type: str="xavier_uniform", ): init_type: str="xavier_uniform", ):
"""Initialize HiFiGANPeriodDiscriminator module. """Initialize HiFiGANPeriodDiscriminator module.
Parameters
---------- Args:
in_channels : int in_channels (int): Number of input channels.
Number of input channels. out_channels (int): Number of output channels.
out_channels : int period (int): Period.
Number of output channels. kernel_sizes (list): Kernel sizes of initial conv layers and the final conv layer.
period : int channels (int): Number of initial channels.
Period. downsample_scales (list): List of downsampling scales.
kernel_sizes : list max_downsample_channels (int): Number of maximum downsampling channels.
Kernel sizes of initial conv layers and the final conv layer. use_additional_convs (bool): Whether to use additional conv layers in residual blocks.
channels : int bias (bool): Whether to add bias parameter in convolution layers.
Number of initial channels. nonlinear_activation (str): Activation function module name.
downsample_scales : list nonlinear_activation_params (dict): Hyperparameters for activation function.
List of downsampling scales. use_weight_norm (bool): Whether to use weight norm.
max_downsample_channels : int
Number of maximum downsampling channels.
use_additional_convs : bool
Whether to use additional conv layers in residual blocks.
bias : bool
Whether to add bias parameter in convolution layers.
nonlinear_activation : str
Activation function module name.
nonlinear_activation_params : dict
Hyperparameters for activation function.
use_weight_norm : bool
Whether to use weight norm.
If set to true, it will be applied to all of the conv layers. If set to true, it will be applied to all of the conv layers.
use_spectral_norm : bool use_spectral_norm (bool): Whether to use spectral norm.
Whether to use spectral norm.
If set to true, it will be applied to all of the conv layers. If set to true, it will be applied to all of the conv layers.
""" """
super().__init__() super().__init__()
...@@ -307,14 +274,11 @@ class HiFiGANPeriodDiscriminator(nn.Layer): ...@@ -307,14 +274,11 @@ class HiFiGANPeriodDiscriminator(nn.Layer):
def forward(self, x): def forward(self, x):
"""Calculate forward propagation. """Calculate forward propagation.
Parameters
---------- Args:
x : Tensor x (Tensor): Input tensor (B, in_channels, T).
Input tensor (B, in_channels, T). Returns:
Returns list: List of each layer's tensors.
----------
list
List of each layer's tensors.
""" """
# transform 1d to 2d -> (B, C, T/P, P) # transform 1d to 2d -> (B, C, T/P, P)
b, c, t = paddle.shape(x) b, c, t = paddle.shape(x)
...@@ -379,12 +343,10 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer): ...@@ -379,12 +343,10 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer):
}, },
init_type: str="xavier_uniform", ): init_type: str="xavier_uniform", ):
"""Initialize HiFiGANMultiPeriodDiscriminator module. """Initialize HiFiGANMultiPeriodDiscriminator module.
Parameters
---------- Args:
periods : list periods (list): List of periods.
List of periods. discriminator_params (dict): Parameters for hifi-gan period discriminator module.
discriminator_params : dict
Parameters for hifi-gan period discriminator module.
The period parameter will be overwritten. The period parameter will be overwritten.
""" """
super().__init__() super().__init__()
...@@ -399,14 +361,11 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer): ...@@ -399,14 +361,11 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer):
def forward(self, x): def forward(self, x):
"""Calculate forward propagation. """Calculate forward propagation.
Parameters
---------- Args:
x : Tensor x (Tensor): Input noise signal (B, 1, T).
Input noise signal (B, 1, T). Returns:
Returns List: List of list of each discriminator outputs, which consists of each layer output tensors.
----------
List
List of list of each discriminator outputs, which consists of each layer output tensors.
""" """
outs = [] outs = []
for f in self.discriminators: for f in self.discriminators:
...@@ -434,32 +393,21 @@ class HiFiGANScaleDiscriminator(nn.Layer): ...@@ -434,32 +393,21 @@ class HiFiGANScaleDiscriminator(nn.Layer):
use_spectral_norm: bool=False, use_spectral_norm: bool=False,
init_type: str="xavier_uniform", ): init_type: str="xavier_uniform", ):
"""Initialize HiFiGAN scale discriminator module. """Initialize HiFiGAN scale discriminator module.
Parameters
---------- Args:
in_channels : int in_channels (int): Number of input channels.
Number of input channels. out_channels (int): Number of output channels.
out_channels : int kernel_sizes (list): List of four kernel sizes. The first will be used for the first conv layer,
Number of output channels.
kernel_sizes : list
List of four kernel sizes. The first will be used for the first conv layer,
and the second is for downsampling part, and the remaining two are for output layers. and the second is for downsampling part, and the remaining two are for output layers.
channels : int channels (int): Initial number of channels for conv layer.
Initial number of channels for conv layer. max_downsample_channels (int): Maximum number of channels for downsampling layers.
max_downsample_channels : int bias (bool): Whether to add bias parameter in convolution layers.
Maximum number of channels for downsampling layers. downsample_scales (list): List of downsampling scales.
bias : bool nonlinear_activation (str): Activation function module name.
Whether to add bias parameter in convolution layers. nonlinear_activation_params (dict): Hyperparameters for activation function.
downsample_scales : list use_weight_norm (bool): Whether to use weight norm.
List of downsampling scales.
nonlinear_activation : str
Activation function module name.
nonlinear_activation_params : dict
Hyperparameters for activation function.
use_weight_norm : bool
Whether to use weight norm.
If set to true, it will be applied to all of the conv layers. If set to true, it will be applied to all of the conv layers.
use_spectral_norm : bool use_spectral_norm (bool): Whether to use spectral norm.
Whether to use spectral norm.
If set to true, it will be applied to all of the conv layers. If set to true, it will be applied to all of the conv layers.
""" """
super().__init__() super().__init__()
...@@ -546,14 +494,11 @@ class HiFiGANScaleDiscriminator(nn.Layer): ...@@ -546,14 +494,11 @@ class HiFiGANScaleDiscriminator(nn.Layer):
def forward(self, x): def forward(self, x):
"""Calculate forward propagation. """Calculate forward propagation.
Parameters
---------- Args:
x : Tensor x (Tensor): Input noise signal (B, 1, T).
Input noise signal (B, 1, T). Returns:
Returns List: List of output tensors of each layer.
----------
List
List of output tensors of each layer.
""" """
outs = [] outs = []
for f in self.layers: for f in self.layers:
...@@ -613,20 +558,14 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer): ...@@ -613,20 +558,14 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer):
follow_official_norm: bool=False, follow_official_norm: bool=False,
init_type: str="xavier_uniform", ): init_type: str="xavier_uniform", ):
"""Initialize HiFiGAN multi-scale discriminator module. """Initialize HiFiGAN multi-scale discriminator module.
Parameters
---------- Args:
scales : int scales (int): Number of multi-scales.
Number of multi-scales. downsample_pooling (str): Pooling module name for downsampling of the inputs.
downsample_pooling : str downsample_pooling_params (dict): Parameters for the above pooling module.
Pooling module name for downsampling of the inputs. discriminator_params (dict): Parameters for hifi-gan scale discriminator module.
downsample_pooling_params : dict follow_official_norm (bool): Whether to follow the norm setting of the official
Parameters for the above pooling module. implementation. The first discriminator uses spectral norm and the other discriminators use weight norm.
discriminator_params : dict
Parameters for hifi-gan scale discriminator module.
follow_official_norm : bool
Whether to follow the norm setting of the official
implementation. The first discriminator uses spectral norm and the other
discriminators use weight norm.
""" """
super().__init__() super().__init__()
...@@ -651,14 +590,11 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer): ...@@ -651,14 +590,11 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer):
def forward(self, x): def forward(self, x):
"""Calculate forward propagation. """Calculate forward propagation.
Parameters
---------- Args:
x : Tensor x (Tensor): Input noise signal (B, 1, T).
Input noise signal (B, 1, T). Returns:
Returns List: List of list of each discriminator outputs, which consists of each layer output tensors.
----------
List
List of list of each discriminator outputs, which consists of each layer output tensors.
""" """
outs = [] outs = []
for f in self.discriminators: for f in self.discriminators:
...@@ -715,23 +651,16 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer): ...@@ -715,23 +651,16 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer):
}, },
init_type: str="xavier_uniform", ): init_type: str="xavier_uniform", ):
"""Initialize HiFiGAN multi-scale + multi-period discriminator module. """Initialize HiFiGAN multi-scale + multi-period discriminator module.
Parameters
---------- Args:
scales : int scales (int): Number of multi-scales.
Number of multi-scales. scale_downsample_pooling (str): Pooling module name for downsampling of the inputs.
scale_downsample_pooling : str scale_downsample_pooling_params (dict): Parameters for the above pooling module.
Pooling module name for downsampling of the inputs. scale_discriminator_params (dict): Parameters for hifi-gan scale discriminator module.
scale_downsample_pooling_params : dict follow_official_norm (bool): Whether to follow the norm setting of the official implementation.
Parameters for the above pooling module. The first discriminator uses spectral norm and the other discriminators use weight norm.
scale_discriminator_params : dict periods (list): List of periods.
Parameters for hifi-gan scale discriminator module. period_discriminator_params (dict): Parameters for hifi-gan period discriminator module.
follow_official_norm : bool): Whether to follow the norm setting of the official
implementation. The first discriminator uses spectral norm and the other
discriminators use weight norm.
periods : list
List of periods.
period_discriminator_params : dict
Parameters for hifi-gan period discriminator module.
The period parameter will be overwritten. The period parameter will be overwritten.
""" """
super().__init__() super().__init__()
...@@ -751,12 +680,10 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer): ...@@ -751,12 +680,10 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer):
def forward(self, x): def forward(self, x):
"""Calculate forward propagation. """Calculate forward propagation.
Parameters
---------- Args:
x : Tensor x (Tensor): Input noise signal (B, 1, T).
Input noise signal (B, 1, T). Returns:
Returns
----------
List: List:
List of list of each discriminator outputs, List of list of each discriminator outputs,
which consists of each layer output tensors. which consists of each layer output tensors.
......
...@@ -51,41 +51,26 @@ class MelGANGenerator(nn.Layer): ...@@ -51,41 +51,26 @@ class MelGANGenerator(nn.Layer):
use_causal_conv: bool=False, use_causal_conv: bool=False,
init_type: str="xavier_uniform", ): init_type: str="xavier_uniform", ):
"""Initialize MelGANGenerator module. """Initialize MelGANGenerator module.
Parameters
---------- Args:
in_channels : int in_channels (int): Number of input channels.
Number of input channels. out_channels (int): Number of output channels,
out_channels : int
Number of output channels,
the number of sub-band is out_channels in multi-band melgan. the number of sub-band is out_channels in multi-band melgan.
kernel_size : int kernel_size (int): Kernel size of initial and final conv layer.
Kernel size of initial and final conv layer. channels (int): Initial number of channels for conv layer.
channels : int bias (bool): Whether to add bias parameter in convolution layers.
Initial number of channels for conv layer. upsample_scales (List[int]): List of upsampling scales.
bias : bool stack_kernel_size (int): Kernel size of dilated conv layers in residual stack.
Whether to add bias parameter in convolution layers. stacks (int): Number of stacks in a single residual stack.
upsample_scales : List[int] nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None
List of upsampling scales. nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network,
stack_kernel_size : int
Kernel size of dilated conv layers in residual stack.
stacks : int
Number of stacks in a single residual stack.
nonlinear_activation : Optional[str], optional
Non linear activation in upsample network, by default None
nonlinear_activation_params : Dict[str, Any], optional
Parameters passed to the linear activation in the upsample network,
by default {} by default {}
pad : str pad (str): Padding function module name before dilated convolution layer.
Padding function module name before dilated convolution layer. pad_params (dict): Hyperparameters for padding function.
pad_params : dict use_final_nonlinear_activation (nn.Layer): Activation function for the final layer.
Hyperparameters for padding function. use_weight_norm (bool): Whether to use weight norm.
use_final_nonlinear_activation : nn.Layer
Activation function for the final layer.
use_weight_norm : bool
Whether to use weight norm.
If set to true, it will be applied to all of the conv layers. If set to true, it will be applied to all of the conv layers.
use_causal_conv : bool use_causal_conv (bool): Whether to use causal convolution.
Whether to use causal convolution.
""" """
super().__init__() super().__init__()
...@@ -207,14 +192,11 @@ class MelGANGenerator(nn.Layer): ...@@ -207,14 +192,11 @@ class MelGANGenerator(nn.Layer):
def forward(self, c): def forward(self, c):
"""Calculate forward propagation. """Calculate forward propagation.
Parameters
---------- Args:
c : Tensor c (Tensor): Input tensor (B, in_channels, T).
Input tensor (B, in_channels, T). Returns:
Returns Tensor: Output tensor (B, out_channels, T * prod(upsample_scales)).
----------
Tensor
Output tensor (B, out_channels, T * prod(upsample_scales)).
""" """
out = self.melgan(c) out = self.melgan(c)
return out return out
...@@ -260,14 +242,11 @@ class MelGANGenerator(nn.Layer): ...@@ -260,14 +242,11 @@ class MelGANGenerator(nn.Layer):
def inference(self, c): def inference(self, c):
"""Perform inference. """Perform inference.
Parameters
---------- Args:
c : Union[Tensor, ndarray] c (Union[Tensor, ndarray]): Input tensor (T, in_channels).
Input tensor (T, in_channels). Returns:
Returns Tensor: Output tensor (out_channels * T * prod(upsample_scales), 1).
----------
Tensor
Output tensor (out_channels * T * prod(upsample_scales), 1).
""" """
# pseudo batch # pseudo batch
c = c.transpose([1, 0]).unsqueeze(0) c = c.transpose([1, 0]).unsqueeze(0)
...@@ -298,33 +277,22 @@ class MelGANDiscriminator(nn.Layer): ...@@ -298,33 +277,22 @@ class MelGANDiscriminator(nn.Layer):
pad_params: Dict[str, Any]={"mode": "reflect"}, pad_params: Dict[str, Any]={"mode": "reflect"},
init_type: str="xavier_uniform", ): init_type: str="xavier_uniform", ):
"""Initialize MelGAN discriminator module. """Initialize MelGAN discriminator module.
Parameters
---------- Args:
in_channels : int in_channels (int): Number of input channels.
Number of input channels. out_channels (int): Number of output channels.
out_channels : int kernel_sizes (List[int]): List of two kernel sizes. The prod will be used for the first conv layer,
Number of output channels.
kernel_sizes : List[int]
List of two kernel sizes. The prod will be used for the first conv layer,
and the first and the second kernel sizes will be used for the last two layers. and the first and the second kernel sizes will be used for the last two layers.
For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15, For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15,
the last two layers' kernel size will be 5 and 3, respectively. the last two layers' kernel size will be 5 and 3, respectively.
channels : int channels (int): Initial number of channels for conv layer.
Initial number of channels for conv layer. max_downsample_channels (int): Maximum number of channels for downsampling layers.
max_downsample_channels : int bias (bool): Whether to add bias parameter in convolution layers.
Maximum number of channels for downsampling layers. downsample_scales (List[int]): List of downsampling scales.
bias : bool nonlinear_activation (str): Activation function module name.
Whether to add bias parameter in convolution layers. nonlinear_activation_params (dict): Hyperparameters for activation function.
downsample_scales : List[int] pad (str): Padding function module name before dilated convolution layer.
List of downsampling scales. pad_params (dict): Hyperparameters for padding function.
nonlinear_activation : str
Activation function module name.
nonlinear_activation_params : dict
Hyperparameters for activation function.
pad : str
Padding function module name before dilated convolution layer.
pad_params : dict
Hyperparameters for padding function.
""" """
super().__init__() super().__init__()
...@@ -395,14 +363,10 @@ class MelGANDiscriminator(nn.Layer): ...@@ -395,14 +363,10 @@ class MelGANDiscriminator(nn.Layer):
def forward(self, x): def forward(self, x):
"""Calculate forward propagation. """Calculate forward propagation.
Parameters Args:
---------- x (Tensor): Input noise signal (B, 1, T).
x : Tensor Returns:
Input noise signal (B, 1, T). List: List of output tensors of each layer (for feat_match_loss).
Returns
----------
List
List of output tensors of each layer (for feat_match_loss).
""" """
outs = [] outs = []
for f in self.layers: for f in self.layers:
...@@ -440,39 +404,24 @@ class MelGANMultiScaleDiscriminator(nn.Layer): ...@@ -440,39 +404,24 @@ class MelGANMultiScaleDiscriminator(nn.Layer):
use_weight_norm: bool=True, use_weight_norm: bool=True,
init_type: str="xavier_uniform", ): init_type: str="xavier_uniform", ):
"""Initialize MelGAN multi-scale discriminator module. """Initialize MelGAN multi-scale discriminator module.
Parameters
---------- Args:
in_channels : int in_channels (int): Number of input channels.
Number of input channels. out_channels (int): Number of output channels.
out_channels : int scales (int): Number of multi-scales.
Number of output channels. downsample_pooling (str): Pooling module name for downsampling of the inputs.
scales : int downsample_pooling_params (dict): Parameters for the above pooling module.
Number of multi-scales. kernel_sizes (List[int]): List of two kernel sizes. The sum will be used for the first conv layer,
downsample_pooling : str
Pooling module name for downsampling of the inputs.
downsample_pooling_params : dict
Parameters for the above pooling module.
kernel_sizes : List[int]
List of two kernel sizes. The sum will be used for the first conv layer,
and the first and the second kernel sizes will be used for the last two layers. and the first and the second kernel sizes will be used for the last two layers.
channels : int channels (int): Initial number of channels for conv layer.
Initial number of channels for conv layer. max_downsample_channels (int): Maximum number of channels for downsampling layers.
max_downsample_channels : int bias (bool): Whether to add bias parameter in convolution layers.
Maximum number of channels for downsampling layers. downsample_scales (List[int]): List of downsampling scales.
bias : bool nonlinear_activation (str): Activation function module name.
Whether to add bias parameter in convolution layers. nonlinear_activation_params (dict): Hyperparameters for activation function.
downsample_scales : List[int] pad (str): Padding function module name before dilated convolution layer.
List of downsampling scales. pad_params (dict): Hyperparameters for padding function.
nonlinear_activation : str use_causal_conv (bool): Whether to use causal convolution.
Activation function module name.
nonlinear_activation_params : dict
Hyperparameters for activation function.
pad : str
Padding function module name before dilated convolution layer.
pad_params : dict
Hyperparameters for padding function.
use_causal_conv : bool
Whether to use causal convolution.
""" """
super().__init__() super().__init__()
...@@ -514,14 +463,10 @@ class MelGANMultiScaleDiscriminator(nn.Layer): ...@@ -514,14 +463,10 @@ class MelGANMultiScaleDiscriminator(nn.Layer):
def forward(self, x): def forward(self, x):
"""Calculate forward propagation. """Calculate forward propagation.
Parameters Args:
---------- x (Tensor): Input noise signal (B, 1, T).
x : Tensor Returns:
Input noise signal (B, 1, T). List: List of list of each discriminator outputs, which consists of each layer output tensors.
Returns
----------
List
List of list of each discriminator outputs, which consists of each layer output tensors.
""" """
outs = [] outs = []
for f in self.discriminators: for f in self.discriminators:
......
...@@ -52,36 +52,22 @@ class StyleMelGANGenerator(nn.Layer): ...@@ -52,36 +52,22 @@ class StyleMelGANGenerator(nn.Layer):
use_weight_norm: bool=True, use_weight_norm: bool=True,
init_type: str="xavier_uniform", ): init_type: str="xavier_uniform", ):
"""Initialize Style MelGAN generator. """Initialize Style MelGAN generator.
Parameters
---------- Args:
in_channels : int in_channels (int): Number of input noise channels.
Number of input noise channels. aux_channels (int): Number of auxiliary input channels.
aux_channels : int channels (int): Number of channels for conv layer.
Number of auxiliary input channels. out_channels (int): Number of output channels.
channels : int kernel_size (int): Kernel size of conv layers.
Number of channels for conv layer. dilation (int): Dilation factor for conv layers.
out_channels : int bias (bool): Whether to add bias parameter in convolution layers.
Number of output channels. noise_upsample_scales (list): List of noise upsampling scales.
kernel_size : int noise_upsample_activation (str): Activation function module name for noise upsampling.
Kernel size of conv layers. noise_upsample_activation_params (dict): Hyperparameters for the above activation function.
dilation : int upsample_scales (list): List of upsampling scales.
Dilation factor for conv layers. upsample_mode (str): Upsampling mode in TADE layer.
bias : bool gated_function (str): Gated function in TADEResBlock ("softmax" or "sigmoid").
Whether to add bias parameter in convolution layers. use_weight_norm (bool): Whether to use weight norm.
noise_upsample_scales : list
List of noise upsampling scales.
noise_upsample_activation : str
Activation function module name for noise upsampling.
noise_upsample_activation_params : dict
Hyperparameters for the above activation function.
upsample_scales : list
List of upsampling scales.
upsample_mode : str
Upsampling mode in TADE layer.
gated_function : str
Gated function in TADEResBlock ("softmax" or "sigmoid").
use_weight_norm : bool
Whether to use weight norm.
If set to true, it will be applied to all of the conv layers. If set to true, it will be applied to all of the conv layers.
""" """
super().__init__() super().__init__()
...@@ -147,16 +133,12 @@ class StyleMelGANGenerator(nn.Layer): ...@@ -147,16 +133,12 @@ class StyleMelGANGenerator(nn.Layer):
def forward(self, c, z=None): def forward(self, c, z=None):
"""Calculate forward propagation. """Calculate forward propagation.
Parameters
---------- Args:
c : Tensor c (Tensor): Auxiliary input tensor (B, channels, T).
Auxiliary input tensor (B, channels, T). z (Tensor): Input noise tensor (B, in_channels, 1).
z : Tensor Returns:
Input noise tensor (B, in_channels, 1). Tensor: Output tensor (B, out_channels, T * prod(upsample_scales)).
Returns
----------
Tensor
Output tensor (B, out_channels, T * prod(upsample_scales)).
""" """
# batch_max_steps(24000) == noise_upsample_factor(80) * upsample_factor(300) # batch_max_steps(24000) == noise_upsample_factor(80) * upsample_factor(300)
if z is None: if z is None:
...@@ -211,14 +193,10 @@ class StyleMelGANGenerator(nn.Layer): ...@@ -211,14 +193,10 @@ class StyleMelGANGenerator(nn.Layer):
def inference(self, c): def inference(self, c):
"""Perform inference. """Perform inference.
Parameters Args:
---------- c (Tensor): Input tensor (T, in_channels).
c : Tensor Returns:
Input tensor (T, in_channels). Tensor: Output tensor (T * prod(upsample_scales), out_channels).
Returns
----------
Tensor
Output tensor (T * prod(upsample_scales), out_channels).
""" """
# (1, in_channels, T) # (1, in_channels, T)
c = c.transpose([1, 0]).unsqueeze(0) c = c.transpose([1, 0]).unsqueeze(0)
...@@ -278,18 +256,13 @@ class StyleMelGANDiscriminator(nn.Layer): ...@@ -278,18 +256,13 @@ class StyleMelGANDiscriminator(nn.Layer):
use_weight_norm: bool=True, use_weight_norm: bool=True,
init_type: str="xavier_uniform", ): init_type: str="xavier_uniform", ):
"""Initialize Style MelGAN discriminator. """Initialize Style MelGAN discriminator.
Parameters
---------- Args:
repeats : int repeats (int): Number of repetitions to apply RWD.
Number of repetitions to apply RWD. window_sizes (list): List of random window sizes.
window_sizes : list pqmf_params (list): List of list of Parameters for PQMF modules
List of random window sizes. discriminator_params (dict): Parameters for base discriminator module.
pqmf_params : list use_weight_norm (bool): Whether to apply weight normalization.
List of list of Parameters for PQMF modules
discriminator_params : dict
Parameters for base discriminator module.
use_weight_norm : bool
Whether to apply weight normalization.
""" """
super().__init__() super().__init__()
...@@ -325,14 +298,10 @@ class StyleMelGANDiscriminator(nn.Layer): ...@@ -325,14 +298,10 @@ class StyleMelGANDiscriminator(nn.Layer):
def forward(self, x): def forward(self, x):
"""Calculate forward propagation. """Calculate forward propagation.
Parameters Args:
---------- x (Tensor): Input tensor (B, 1, T).
x : Tensor Returns:
Input tensor (B, 1, T). List: List of discriminator outputs, #items in the list will be
Returns
----------
List
List of discriminator outputs, #items in the list will be
equal to repeats * #discriminators. equal to repeats * #discriminators.
""" """
outs = [] outs = []
......
...@@ -31,51 +31,30 @@ from paddlespeech.t2s.modules.upsample import ConvInUpsampleNet ...@@ -31,51 +31,30 @@ from paddlespeech.t2s.modules.upsample import ConvInUpsampleNet
class PWGGenerator(nn.Layer): class PWGGenerator(nn.Layer):
"""Wave Generator for Parallel WaveGAN """Wave Generator for Parallel WaveGAN
Parameters Args:
---------- in_channels (int, optional): Number of channels of the input waveform, by default 1
in_channels : int, optional out_channels (int, optional): Number of channels of the output waveform, by default 1
Number of channels of the input waveform, by default 1 kernel_size (int, optional): Kernel size of the residual blocks inside, by default 3
out_channels : int, optional layers (int, optional): Number of residual blocks inside, by default 30
Number of channels of the output waveform, by default 1 stacks (int, optional): The number of groups to split the residual blocks into, by default 3
kernel_size : int, optional Within each group, the dilation of the residual block grows exponentially.
Kernel size of the residual blocks inside, by default 3 residual_channels (int, optional): Residual channel of the residual blocks, by default 64
layers : int, optional gate_channels (int, optional): Gate channel of the residual blocks, by default 128
Number of residual blocks inside, by default 30 skip_channels (int, optional): Skip channel of the residual blocks, by default 64
stacks : int, optional aux_channels (int, optional): Auxiliary channel of the residual blocks, by default 80
The number of groups to split the residual blocks into, by default 3 aux_context_window (int, optional): The context window size of the first convolution applied to the
Within each group, the dilation of the residual block grows
exponentially.
residual_channels : int, optional
Residual channel of the residual blocks, by default 64
gate_channels : int, optional
Gate channel of the residual blocks, by default 128
skip_channels : int, optional
Skip channel of the residual blocks, by default 64
aux_channels : int, optional
Auxiliary channel of the residual blocks, by default 80
aux_context_window : int, optional
The context window size of the first convolution applied to the
auxiliary input, by default 2 auxiliary input, by default 2
dropout : float, optional dropout (float, optional): Dropout of the residual blocks, by default 0.
Dropout of the residual blocks, by default 0. bias (bool, optional): Whether to use bias in residual blocks, by default True
bias : bool, optional use_weight_norm (bool, optional): Whether to use weight norm in all convolutions, by default True
Whether to use bias in residual blocks, by default True use_causal_conv (bool, optional): Whether to use causal padding in the upsample network and residual
use_weight_norm : bool, optional
Whether to use weight norm in all convolutions, by default True
use_causal_conv : bool, optional
Whether to use causal padding in the upsample network and residual
blocks, by default False blocks, by default False
upsample_scales : List[int], optional upsample_scales (List[int], optional): Upsample scales of the upsample network, by default [4, 4, 4, 4]
Upsample scales of the upsample network, by default [4, 4, 4, 4] nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None
nonlinear_activation : Optional[str], optional nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network,
Non linear activation in upsample network, by default None
nonlinear_activation_params : Dict[str, Any], optional
Parameters passed to the linear activation in the upsample network,
by default {} by default {}
interpolate_mode : str, optional interpolate_mode (str, optional): Interpolation mode of the upsample network, by default "nearest"
Interpolation mode of the upsample network, by default "nearest" freq_axis_kernel_size (int, optional): Kernel size along the frequency axis of the upsample network, by default 1
freq_axis_kernel_size : int, optional
Kernel size along the frequency axis of the upsample network, by default 1
""" """
def __init__( def __init__(
...@@ -167,18 +146,13 @@ class PWGGenerator(nn.Layer): ...@@ -167,18 +146,13 @@ class PWGGenerator(nn.Layer):
def forward(self, x, c): def forward(self, x, c):
"""Generate waveform. """Generate waveform.
Parameters Args:
---------- x(Tensor): Shape (N, C_in, T), The input waveform.
x : Tensor c(Tensor): Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It
Shape (N, C_in, T), The input waveform.
c : Tensor
Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It
is upsampled to match the time resolution of the input. is upsampled to match the time resolution of the input.
Returns Returns:
------- Tensor: Shape (N, C_out, T), the generated waveform.
Tensor
Shape (N, C_out, T), the generated waveform.
""" """
c = self.upsample_net(c) c = self.upsample_net(c)
assert c.shape[-1] == x.shape[-1] assert c.shape[-1] == x.shape[-1]
...@@ -218,19 +192,14 @@ class PWGGenerator(nn.Layer): ...@@ -218,19 +192,14 @@ class PWGGenerator(nn.Layer):
self.apply(_remove_weight_norm) self.apply(_remove_weight_norm)
def inference(self, c=None): def inference(self, c=None):
"""Waveform generation. This function is used for single instance """Waveform generation. This function is used for single instance inference.
inference.
Parameters Args:
---------- c(Tensor, optional): Shape (T', C_aux), the auxiliary input, by default None
c : Tensor, optional x(Tensor, optional): Shape (T, C_in), the noise waveform, by default None
Shape (T', C_aux), the auxiliary input, by default None
x : Tensor, optional Returns:
Shape (T, C_in), the noise waveform, by default None Tensor: Shape (T, C_out), the generated waveform
If not provided, a sample is drawn from a gaussian distribution.
Returns
-------
Tensor
Shape (T, C_out), the generated waveform
""" """
# when to static, can not input x, see https://github.com/PaddlePaddle/Parakeet/pull/132/files # when to static, can not input x, see https://github.com/PaddlePaddle/Parakeet/pull/132/files
x = paddle.randn( x = paddle.randn(
...@@ -244,31 +213,20 @@ class PWGGenerator(nn.Layer): ...@@ -244,31 +213,20 @@ class PWGGenerator(nn.Layer):
class PWGDiscriminator(nn.Layer): class PWGDiscriminator(nn.Layer):
"""A convolutional discriminator for audio. """A convolutional discriminator for audio.
Parameters Args:
---------- in_channels (int, optional): Number of channels of the input audio, by default 1
in_channels : int, optional out_channels (int, optional): Output feature size, by default 1
Number of channels of the input audio, by default 1 kernel_size (int, optional): Kernel size of convolutional sublayers, by default 3
out_channels : int, optional layers (int, optional): Number of layers, by default 10
Output feature size, by default 1 conv_channels (int, optional): Feature size of the convolutional sublayers, by default 64
kernel_size : int, optional dilation_factor (int, optional): The factor with which dilation of each convolutional sublayers grows
Kernel size of convolutional sublayers, by default 3 exponentially if it is greater than 1, else the dilation of each convolutional sublayers grows linearly,
layers : int, optional by default 1
Number of layers, by default 10 nonlinear_activation (str, optional): The activation after each convolutional sublayer, by default "leakyrelu"
conv_channels : int, optional nonlinear_activation_params (Dict[str, Any], optional): The parameters passed to the activation's initializer, by default
Feature size of the convolutional sublayers, by default 64
dilation_factor : int, optional
The factor with which dilation of each convolutional sublayers grows
exponentially if it is greater than 1, else the dilation of each
convolutional sublayers grows linearly, by default 1
nonlinear_activation : str, optional
The activation after each convolutional sublayer, by default "leakyrelu"
nonlinear_activation_params : Dict[str, Any], optional
The parameters passed to the activation's initializer, by default
{"negative_slope": 0.2} {"negative_slope": 0.2}
bias : bool, optional bias (bool, optional): Whether to use bias in convolutional sublayers, by default True
Whether to use bias in convolutional sublayers, by default True use_weight_norm (bool, optional): Whether to use weight normalization at all convolutional sublayers,
use_weight_norm : bool, optional
Whether to use weight normalization at all convolutional sublayers,
by default True by default True
""" """
...@@ -330,15 +288,12 @@ class PWGDiscriminator(nn.Layer): ...@@ -330,15 +288,12 @@ class PWGDiscriminator(nn.Layer):
def forward(self, x): def forward(self, x):
""" """
Parameters
---------- Args:
x : Tensor x (Tensor): Shape (N, in_channels, num_samples), the input audio.
Shape (N, in_channels, num_samples), the input audio.
Returns:
Returns Tensor: Shape (N, out_channels, num_samples), the predicted logits.
-------
Tensor
Shape (N, out_channels, num_samples), the predicted logits.
""" """
return self.conv_layers(x) return self.conv_layers(x)
...@@ -362,39 +317,25 @@ class PWGDiscriminator(nn.Layer): ...@@ -362,39 +317,25 @@ class PWGDiscriminator(nn.Layer):
class ResidualPWGDiscriminator(nn.Layer): class ResidualPWGDiscriminator(nn.Layer):
"""A wavenet-style discriminator for audio. """A wavenet-style discriminator for audio.
Parameters Args:
---------- in_channels (int, optional): Number of channels of the input audio, by default 1
in_channels : int, optional out_channels (int, optional): Output feature size, by default 1
Number of channels of the input audio, by default 1 kernel_size (int, optional): Kernel size of residual blocks, by default 3
out_channels : int, optional layers (int, optional): Number of residual blocks, by default 30
Output feature size, by default 1 stacks (int, optional): Number of groups of residual blocks, within which the dilation
kernel_size : int, optional
Kernel size of residual blocks, by default 3
layers : int, optional
Number of residual blocks, by default 30
stacks : int, optional
Number of groups of residual blocks, within which the dilation
of each residual blocks grows exponentially, by default 3 of each residual blocks grows exponentially, by default 3
residual_channels : int, optional residual_channels (int, optional): Residual channels of residual blocks, by default 64
Residual channels of residual blocks, by default 64 gate_channels (int, optional): Gate channels of residual blocks, by default 128
gate_channels : int, optional skip_channels (int, optional): Skip channels of residual blocks, by default 64
Gate channels of residual blocks, by default 128 dropout (float, optional): Dropout probability of residual blocks, by default 0.
skip_channels : int, optional bias (bool, optional): Whether to use bias in residual blocks, by default True
Skip channels of residual blocks, by default 64 use_weight_norm (bool, optional): Whether to use weight normalization in all convolutional layers,
dropout : float, optional
Dropout probability of residual blocks, by default 0.
bias : bool, optional
Whether to use bias in residual blocks, by default True
use_weight_norm : bool, optional
Whether to use weight normalization in all convolutional layers,
by default True by default True
use_causal_conv : bool, optional use_causal_conv (bool, optional): Whether to use causal convolution in residual blocks, by default False
Whether to use causal convolution in residual blocks, by default False nonlinear_activation (str, optional): Activation after convolutions other than those in residual blocks,
nonlinear_activation : str, optional
Activation after convolutions other than those in residual blocks,
by default "leakyrelu" by default "leakyrelu"
nonlinear_activation_params : Dict[str, Any], optional nonlinear_activation_params (Dict[str, Any], optional): Parameters to pass to the activation,
Parameters to pass to the activation, by default {"negative_slope": 0.2} by default {"negative_slope": 0.2}
""" """
def __init__( def __init__(
...@@ -463,15 +404,11 @@ class ResidualPWGDiscriminator(nn.Layer): ...@@ -463,15 +404,11 @@ class ResidualPWGDiscriminator(nn.Layer):
def forward(self, x): def forward(self, x):
""" """
Parameters Args:
---------- x(Tensor): Shape (N, in_channels, num_samples), the input audio.
x : Tensor
Shape (N, in_channels, num_samples), the input audio. Returns:
Tensor: Shape (N, out_channels, num_samples), the predicted logits.
Returns
-------
Tensor
Shape (N, out_channels, num_samples), the predicted logits.
""" """
x = self.first_conv(x) x = self.first_conv(x)
skip = 0 skip = 0
......
...@@ -81,69 +81,39 @@ class Tacotron2(nn.Layer): ...@@ -81,69 +81,39 @@ class Tacotron2(nn.Layer):
# training related # training related
init_type: str="xavier_uniform", ): init_type: str="xavier_uniform", ):
"""Initialize Tacotron2 module. """Initialize Tacotron2 module.
Parameters Args:
---------- idim (int): Dimension of the inputs.
idim : int odim (int): Dimension of the outputs.
Dimension of the inputs. embed_dim (int): Dimension of the token embedding.
odim : int elayers (int): Number of encoder blstm layers.
Dimension of the outputs. eunits (int): Number of encoder blstm units.
embed_dim : int econv_layers (int): Number of encoder conv layers.
Dimension of the token embedding. econv_filts (int): Number of encoder conv filter size.
elayers : int econv_chans (int): Number of encoder conv filter channels.
Number of encoder blstm layers. dlayers (int): Number of decoder lstm layers.
eunits : int dunits (int): Number of decoder lstm units.
Number of encoder blstm units. prenet_layers (int): Number of prenet layers.
econv_layers : int prenet_units (int): Number of prenet units.
Number of encoder conv layers. postnet_layers (int): Number of postnet layers.
econv_filts : int postnet_filts (int): Number of postnet filter size.
Number of encoder conv filter size. postnet_chans (int): Number of postnet filter channels.
econv_chans : int output_activation (str): Name of activation function for outputs.
Number of encoder conv filter channels. adim (int): Number of dimension of mlp in attention.
dlayers : int aconv_chans (int): Number of attention conv filter channels.
Number of decoder lstm layers. aconv_filts (int): Number of attention conv filter size.
dunits : int cumulate_att_w (bool): Whether to cumulate previous attention weight.
Number of decoder lstm units. use_batch_norm (bool): Whether to use batch normalization.
prenet_layers : int use_concate (bool): Whether to concat enc outputs w/ dec lstm outputs.
Number of prenet layers. reduction_factor (int): Reduction factor.
prenet_units : int spk_num (Optional[int]): Number of speakers. If set to > 1, assume that the
Number of prenet units.
postnet_layers : int
Number of postnet layers.
postnet_filts : int
Number of postnet filter size.
postnet_chans : int
Number of postnet filter channels.
output_activation : str
Name of activation function for outputs.
adim : int
Number of dimension of mlp in attention.
aconv_chans : int
Number of attention conv filter channels.
aconv_filts : int
Number of attention conv filter size.
cumulate_att_w : bool
Whether to cumulate previous attention weight.
use_batch_norm : bool
Whether to use batch normalization.
use_concate : bool
Whether to concat enc outputs w/ dec lstm outputs.
reduction_factor : int
Reduction factor.
spk_num : Optional[int]
Number of speakers. If set to > 1, assume that the
sids will be provided as the input and use sid embedding layer. sids will be provided as the input and use sid embedding layer.
lang_num : Optional[int] lang_num (Optional[int]): Number of languages. If set to > 1, assume that the
Number of languages. If set to > 1, assume that the
lids will be provided as the input and use sid embedding layer. lids will be provided as the input and use sid embedding layer.
spk_embed_dim : Optional[int] spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0,
Speaker embedding dimension. If set to > 0,
assume that spk_emb will be provided as the input. assume that spk_emb will be provided as the input.
spk_embed_integration_type : str spk_embed_integration_type (str): How to integrate speaker embedding.
How to integrate speaker embedding. dropout_rate (float): Dropout rate.
dropout_rate : float zoneout_rate (float): Zoneout rate.
Dropout rate.
zoneout_rate : float
Zoneout rate.
""" """
assert check_argument_types() assert check_argument_types()
super().__init__() super().__init__()
...@@ -258,31 +228,19 @@ class Tacotron2(nn.Layer): ...@@ -258,31 +228,19 @@ class Tacotron2(nn.Layer):
) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
"""Calculate forward propagation. """Calculate forward propagation.
Parameters Args:
---------- text (Tensor(int64)): Batch of padded character ids (B, T_text).
text : Tensor(int64) text_lengths (Tensor(int64)): Batch of lengths of each input batch (B,).
Batch of padded character ids (B, T_text). speech (Tensor): Batch of padded target features (B, T_feats, odim).
text_lengths : Tensor(int64) speech_lengths (Tensor(int64)): Batch of the lengths of each target (B,).
Batch of lengths of each input batch (B,). spk_emb (Optional[Tensor]): Batch of speaker embeddings (B, spk_embed_dim).
speech : Tensor spk_id (Optional[Tensor]): Batch of speaker IDs (B, 1).
Batch of padded target features (B, T_feats, odim). lang_id (Optional[Tensor]): Batch of language IDs (B, 1).
speech_lengths : Tensor(int64)
Batch of the lengths of each target (B,). Returns:
spk_emb : Optional[Tensor] Tensor: Loss scalar value.
Batch of speaker embeddings (B, spk_embed_dim). Dict: Statistics to be monitored.
spk_id : Optional[Tensor] Tensor: Weight value if not joint training else model outputs.
Batch of speaker IDs (B, 1).
lang_id : Optional[Tensor]
Batch of language IDs (B, 1).
Returns
----------
Tensor
Loss scalar value.
Dict
Statistics to be monitored.
Tensor
Weight value if not joint training else model outputs.
""" """
text = text[:, :text_lengths.max()] text = text[:, :text_lengths.max()]
...@@ -369,35 +327,21 @@ class Tacotron2(nn.Layer): ...@@ -369,35 +327,21 @@ class Tacotron2(nn.Layer):
use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]: use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]:
"""Generate the sequence of features given the sequences of characters. """Generate the sequence of features given the sequences of characters.
Parameters Args:
---------- text (Tensor(int64)): Input sequence of characters (T_text,).
text : Tensor(int64) speech (Optional[Tensor]): Feature sequence to extract style (N, idim).
Input sequence of characters (T_text,). spk_emb (Optional[Tensor]): Speaker embedding (spk_embed_dim,).
speech : Optional[Tensor] spk_id (Optional[Tensor]): Speaker ID (1,).
Feature sequence to extract style (N, idim). lang_id (Optional[Tensor]): Language ID (1,).
spk_emb : Optional[Tensor] threshold (float): Threshold in inference.
Speaker embedding (spk_embed_dim,). minlenratio (float): Minimum length ratio in inference.
spk_id : Optional[Tensor] maxlenratio (float): Maximum length ratio in inference.
Speaker ID (1,). use_att_constraint (bool): Whether to apply attention constraint.
lang_id : Optional[Tensor] backward_window (int): Backward window in attention constraint.
Language ID (1,). forward_window (int): Forward window in attention constraint.
threshold : float use_teacher_forcing (bool): Whether to use teacher forcing.
Threshold in inference.
minlenratio : float Returns:
Minimum length ratio in inference.
maxlenratio : float
Maximum length ratio in inference.
use_att_constraint : bool
Whether to apply attention constraint.
backward_window : int
Backward window in attention constraint.
forward_window : int
Forward window in attention constraint.
use_teacher_forcing : bool
Whether to use teacher forcing.
Return
----------
Dict[str, Tensor] Dict[str, Tensor]
Output dict including the following items: Output dict including the following items:
* feat_gen (Tensor): Output sequence of features (T_feats, odim). * feat_gen (Tensor): Output sequence of features (T_feats, odim).
...@@ -458,17 +402,12 @@ class Tacotron2(nn.Layer): ...@@ -458,17 +402,12 @@ class Tacotron2(nn.Layer):
spk_emb: paddle.Tensor) -> paddle.Tensor: spk_emb: paddle.Tensor) -> paddle.Tensor:
"""Integrate speaker embedding with hidden states. """Integrate speaker embedding with hidden states.
Parameters Args:
---------- hs (Tensor): Batch of hidden state sequences (B, Tmax, eunits).
hs : Tensor spk_emb (Tensor): Batch of speaker embeddings (B, spk_embed_dim).
Batch of hidden state sequences (B, Tmax, eunits).
spk_emb : Tensor Returns:
Batch of speaker embeddings (B, spk_embed_dim). Tensor: Batch of integrated hidden state sequences (B, Tmax, eunits) if
Returns
----------
Tensor
Batch of integrated hidden state sequences (B, Tmax, eunits) if
integration_type is "add" else (B, Tmax, eunits + spk_embed_dim). integration_type is "add" else (B, Tmax, eunits + spk_embed_dim).
""" """
......
...@@ -48,126 +48,66 @@ class TransformerTTS(nn.Layer): ...@@ -48,126 +48,66 @@ class TransformerTTS(nn.Layer):
.. _`Neural Speech Synthesis with Transformer Network`: .. _`Neural Speech Synthesis with Transformer Network`:
https://arxiv.org/pdf/1809.08895.pdf https://arxiv.org/pdf/1809.08895.pdf
Parameters Args:
---------- idim (int): Dimension of the inputs.
idim : int odim (int): Dimension of the outputs.
Dimension of the inputs. embed_dim (int, optional): Dimension of character embedding.
odim : int eprenet_conv_layers (int, optional): Number of encoder prenet convolution layers.
Dimension of the outputs. eprenet_conv_chans (int, optional): Number of encoder prenet convolution channels.
embed_dim : int, optional eprenet_conv_filts (int, optional): Filter size of encoder prenet convolution.
Dimension of character embedding. dprenet_layers (int, optional): Number of decoder prenet layers.
eprenet_conv_layers : int, optional dprenet_units (int, optional): Number of decoder prenet hidden units.
Number of encoder prenet convolution layers. elayers (int, optional): Number of encoder layers.
eprenet_conv_chans : int, optional eunits (int, optional): Number of encoder hidden units.
Number of encoder prenet convolution channels. adim (int, optional): Number of attention transformation dimensions.
eprenet_conv_filts : int, optional aheads (int, optional): Number of heads for multi head attention.
Filter size of encoder prenet convolution. dlayers (int, optional): Number of decoder layers.
dprenet_layers : int, optional dunits (int, optional): Number of decoder hidden units.
Number of decoder prenet layers. postnet_layers (int, optional): Number of postnet layers.
dprenet_units : int, optional postnet_chans (int, optional): Number of postnet channels.
Number of decoder prenet hidden units. postnet_filts (int, optional): Filter size of postnet.
elayers : int, optional use_scaled_pos_enc (bool, optional): Whether to use trainable scaled positional encoding.
Number of encoder layers. use_batch_norm (bool, optional): Whether to use batch normalization in encoder prenet.
eunits : int, optional encoder_normalize_before (bool, optional): Whether to perform layer normalization before encoder block.
Number of encoder hidden units. decoder_normalize_before (bool, optional): Whether to perform layer normalization before decoder block.
adim : int, optional encoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in encoder.
Number of attention transformation dimensions. decoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in decoder.
aheads : int, optional positionwise_layer_type (str, optional): Position-wise operation type.
Number of heads for multi head attention. positionwise_conv_kernel_size (int, optional): Kernel size in position wise conv 1d.
dlayers : int, optional reduction_factor (int, optional): Reduction factor.
Number of decoder layers. spk_embed_dim (int, optional): Number of speaker embedding dimensions.
dunits : int, optional spk_embed_integration_type (str, optional): How to integrate speaker embedding.
Number of decoder hidden units. use_gst (str, optional): Whether to use global style token.
postnet_layers : int, optional gst_tokens (int, optional): The number of GST embeddings.
Number of postnet layers. gst_heads (int, optional): The number of heads in GST multihead attention.
postnet_chans : int, optional gst_conv_layers (int, optional): The number of conv layers in GST.
Number of postnet channels. gst_conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in GST.
postnet_filts : int, optional gst_conv_kernel_size (int, optional): Kernel size of conv layers in GST.
Filter size of postnet. gst_conv_stride (int, optional): Stride size of conv layers in GST.
use_scaled_pos_enc : bool, optional gst_gru_layers (int, optional): The number of GRU layers in GST.
Whether to use trainable scaled positional encoding. gst_gru_units (int, optional): The number of GRU units in GST.
use_batch_norm : bool, optional transformer_lr (float, optional): Initial value of learning rate.
Whether to use batch normalization in encoder prenet. transformer_warmup_steps (int, optional): Optimizer warmup steps.
encoder_normalize_before : bool, optional transformer_enc_dropout_rate (float, optional): Dropout rate in encoder except attention and positional encoding.
Whether to perform layer normalization before encoder block. transformer_enc_positional_dropout_rate (float, optional): Dropout rate after encoder positional encoding.
decoder_normalize_before : bool, optional transformer_enc_attn_dropout_rate (float, optional): Dropout rate in encoder self-attention module.
Whether to perform layer normalization before decoder block. transformer_dec_dropout_rate (float, optional): Dropout rate in decoder except attention & positional encoding.
encoder_concat_after : bool, optional transformer_dec_positional_dropout_rate (float, optional): Dropout rate after decoder positional encoding.
Whether to concatenate attention layer's input and output in encoder. transformer_dec_attn_dropout_rate (float, optional): Dropout rate in decoder self-attention module.
decoder_concat_after : bool, optional transformer_enc_dec_attn_dropout_rate (float, optional): Dropout rate in encoder-decoder attention module.
Whether to concatenate attention layer's input and output in decoder. init_type (str, optional): How to initialize transformer parameters.
positionwise_layer_type : str, optional init_enc_alpha (float, optional): Initial value of alpha in scaled pos encoding of the encoder.
Position-wise operation type. init_dec_alpha (float, optional): Initial value of alpha in scaled pos encoding of the decoder.
positionwise_conv_kernel_size : int, optional eprenet_dropout_rate (float, optional): Dropout rate in encoder prenet.
Kernel size in position wise conv 1d. dprenet_dropout_rate (float, optional): Dropout rate in decoder prenet.
reduction_factor : int, optional postnet_dropout_rate (float, optional): Dropout rate in postnet.
Reduction factor. use_masking (bool, optional): Whether to apply masking for padded part in loss calculation.
spk_embed_dim : int, optional use_weighted_masking (bool, optional): Whether to apply weighted masking in loss calculation.
Number of speaker embedding dimensions. bce_pos_weight (float, optional): Positive sample weight in bce calculation (only for use_masking=true).
spk_embed_integration_type : str, optional loss_type (str, optional): How to calculate loss.
How to integrate speaker embedding. use_guided_attn_loss (bool, optional): Whether to use guided attention loss.
use_gst : str, optional num_heads_applied_guided_attn (int, optional): Number of heads in each layer to apply guided attention loss.
Whether to use global style token. num_layers_applied_guided_attn (int, optional): Number of layers to apply guided attention loss.
gst_tokens : int, optional
The number of GST embeddings.
gst_heads : int, optional
The number of heads in GST multihead attention.
gst_conv_layers : int, optional
The number of conv layers in GST.
gst_conv_chans_list : Sequence[int], optional
List of the number of channels of conv layers in GST.
gst_conv_kernel_size : int, optional
Kernel size of conv layers in GST.
gst_conv_stride : int, optional
Stride size of conv layers in GST.
gst_gru_layers : int, optional
The number of GRU layers in GST.
gst_gru_units : int, optional
The number of GRU units in GST.
transformer_lr : float, optional
Initial value of learning rate.
transformer_warmup_steps : int, optional
Optimizer warmup steps.
transformer_enc_dropout_rate : float, optional
Dropout rate in encoder except attention and positional encoding.
transformer_enc_positional_dropout_rate : float, optional
Dropout rate after encoder positional encoding.
transformer_enc_attn_dropout_rate : float, optional
Dropout rate in encoder self-attention module.
transformer_dec_dropout_rate : float, optional
Dropout rate in decoder except attention & positional encoding.
transformer_dec_positional_dropout_rate : float, optional
Dropout rate after decoder positional encoding.
transformer_dec_attn_dropout_rate : float, optional
Dropout rate in decoder self-attention module.
transformer_enc_dec_attn_dropout_rate : float, optional
Dropout rate in encoder-decoder attention module.
init_type : str, optional
How to initialize transformer parameters.
init_enc_alpha : float, optional
Initial value of alpha in scaled pos encoding of the encoder.
init_dec_alpha : float, optional
Initial value of alpha in scaled pos encoding of the decoder.
eprenet_dropout_rate : float, optional
Dropout rate in encoder prenet.
dprenet_dropout_rate : float, optional
Dropout rate in decoder prenet.
postnet_dropout_rate : float, optional
Dropout rate in postnet.
use_masking : bool, optional
Whether to apply masking for padded part in loss calculation.
use_weighted_masking : bool, optional
Whether to apply weighted masking in loss calculation.
bce_pos_weight : float, optional
Positive sample weight in bce calculation (only for use_masking=true).
loss_type : str, optional
How to calculate loss.
use_guided_attn_loss : bool, optional
Whether to use guided attention loss.
num_heads_applied_guided_attn : int, optional
Number of heads in each layer to apply guided attention loss.
num_layers_applied_guided_attn : int, optional
Number of layers to apply guided attention loss.
List of module names to apply guided attention loss. List of module names to apply guided attention loss.
""" """
...@@ -398,25 +338,16 @@ class TransformerTTS(nn.Layer): ...@@ -398,25 +338,16 @@ class TransformerTTS(nn.Layer):
) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
"""Calculate forward propagation. """Calculate forward propagation.
Parameters Args:
---------- text(Tensor(int64)): Batch of padded character ids (B, Tmax).
text : Tensor(int64) text_lengths(Tensor(int64)): Batch of lengths of each input batch (B,).
Batch of padded character ids (B, Tmax). speech(Tensor): Batch of padded target features (B, Lmax, odim).
text_lengths : Tensor(int64) speech_lengths(Tensor(int64)): Batch of the lengths of each target (B,).
Batch of lengths of each input batch (B,). spk_emb(Tensor, optional): Batch of speaker embeddings (B, spk_embed_dim).
speech : Tensor
Batch of padded target features (B, Lmax, odim). Returns:
speech_lengths : Tensor(int64) Tensor: Loss scalar value.
Batch of the lengths of each target (B,). Dict: Statistics to be monitored.
spk_emb : Tensor, optional
Batch of speaker embeddings (B, spk_embed_dim).
Returns
----------
Tensor
Loss scalar value.
Dict
Statistics to be monitored.
""" """
# input of embedding must be int64 # input of embedding must be int64
...@@ -525,31 +456,19 @@ class TransformerTTS(nn.Layer): ...@@ -525,31 +456,19 @@ class TransformerTTS(nn.Layer):
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Generate the sequence of features given the sequences of characters. """Generate the sequence of features given the sequences of characters.
Parameters Args:
---------- text(Tensor(int64)): Input sequence of characters (T,).
text : Tensor(int64) speech(Tensor, optional): Feature sequence to extract style (N, idim).
Input sequence of characters (T,). spk_emb(Tensor, optional): Speaker embedding vector (spk_embed_dim,).
speech : Tensor, optional threshold(float, optional): Threshold in inference.
Feature sequence to extract style (N, idim). minlenratio(float, optional): Minimum length ratio in inference.
spk_emb : Tensor, optional maxlenratio(float, optional): Maximum length ratio in inference.
Speaker embedding vector (spk_embed_dim,). use_teacher_forcing(bool, optional): Whether to use teacher forcing.
threshold : float, optional
Threshold in inference. Returns:
minlenratio : float, optional Tensor: Output sequence of features (L, odim).
Minimum length ratio in inference. Tensor: Output sequence of stop probabilities (L,).
maxlenratio : float, optional Tensor: Encoder-decoder (source) attention weights (#layers, #heads, L, T).
Maximum length ratio in inference.
use_teacher_forcing : bool, optional
Whether to use teacher forcing.
Returns
----------
Tensor
Output sequence of features (L, odim).
Tensor
Output sequence of stop probabilities (L,).
Tensor
Encoder-decoder (source) attention weights (#layers, #heads, L, T).
""" """
# input of embedding must be int64 # input of embedding must be int64
...@@ -671,19 +590,13 @@ class TransformerTTS(nn.Layer): ...@@ -671,19 +590,13 @@ class TransformerTTS(nn.Layer):
def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor: def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor:
"""Make masks for self-attention. """Make masks for self-attention.
Parameters Args:
---------- ilens(Tensor): Batch of lengths (B,).
ilens : Tensor
Batch of lengths (B,).
Returns Returns:
------- Tensor: Mask tensor for self-attention. dtype=paddle.bool
Tensor
Mask tensor for self-attention.
dtype=paddle.bool
Examples Examples:
-------
>>> ilens = [5, 3] >>> ilens = [5, 3]
>>> self._source_mask(ilens) >>> self._source_mask(ilens)
tensor([[[1, 1, 1, 1, 1], tensor([[[1, 1, 1, 1, 1],
...@@ -696,18 +609,13 @@ class TransformerTTS(nn.Layer): ...@@ -696,18 +609,13 @@ class TransformerTTS(nn.Layer):
def _target_mask(self, olens: paddle.Tensor) -> paddle.Tensor: def _target_mask(self, olens: paddle.Tensor) -> paddle.Tensor:
"""Make masks for masked self-attention. """Make masks for masked self-attention.
Parameters Args:
---------- olens (Tensor(int64)): Batch of lengths (B,).
olens : LongTensor
Batch of lengths (B,).
Returns Returns:
---------- Tensor: Mask tensor for masked self-attention.
Tensor
Mask tensor for masked self-attention.
Examples Examples:
----------
>>> olens = [5, 3] >>> olens = [5, 3]
>>> self._target_mask(olens) >>> self._target_mask(olens)
tensor([[[1, 0, 0, 0, 0], tensor([[[1, 0, 0, 0, 0],
...@@ -731,17 +639,12 @@ class TransformerTTS(nn.Layer): ...@@ -731,17 +639,12 @@ class TransformerTTS(nn.Layer):
spk_emb: paddle.Tensor) -> paddle.Tensor: spk_emb: paddle.Tensor) -> paddle.Tensor:
"""Integrate speaker embedding with hidden states. """Integrate speaker embedding with hidden states.
Parameters Args:
---------- hs(Tensor): Batch of hidden state sequences (B, Tmax, adim).
hs : Tensor spk_emb(Tensor): Batch of speaker embeddings (B, spk_embed_dim).
Batch of hidden state sequences (B, Tmax, adim).
spk_emb : Tensor Returns:
Batch of speaker embeddings (B, spk_embed_dim). Tensor: Batch of integrated hidden state sequences (B, Tmax, adim).
Returns
----------
Tensor
Batch of integrated hidden state sequences (B, Tmax, adim).
""" """
if self.spk_embed_integration_type == "add": if self.spk_embed_integration_type == "add":
......
此差异已折叠。
...@@ -67,14 +67,10 @@ class MelResNet(nn.Layer): ...@@ -67,14 +67,10 @@ class MelResNet(nn.Layer):
def forward(self, x): def forward(self, x):
''' '''
Parameters Args:
---------- x (Tensor): Input tensor (B, in_dims, T).
x : Tensor Returns:
Input tensor (B, in_dims, T). Tensor: Output tensor (B, res_out_dims, T).
Returns
----------
Tensor
Output tensor (B, res_out_dims, T).
''' '''
x = self.conv_in(x) x = self.conv_in(x)
...@@ -121,16 +117,11 @@ class UpsampleNetwork(nn.Layer): ...@@ -121,16 +117,11 @@ class UpsampleNetwork(nn.Layer):
def forward(self, m): def forward(self, m):
''' '''
Parameters Args:
---------- m (Tensor): Input tensor (B, C_aux, T).
c : Tensor Returns:
Input tensor (B, C_aux, T). Tensor: Output tensor (B, (T - 2 * pad) * prod(upsample_scales), C_aux).
Returns Tensor: Output tensor (B, (T - 2 * pad) * prod(upsample_scales), res_out_dims).
----------
Tensor
Output tensor (B, (T - 2 * pad) * prob(upsample_scales), C_aux).
Tensor
Output tensor (B, (T - 2 * pad) * prob(upsample_scales), res_out_dims).
''' '''
# aux: [B, C_aux, T] # aux: [B, C_aux, T]
# -> [B, res_out_dims, T - 2 * aux_context_window] # -> [B, res_out_dims, T - 2 * aux_context_window]
...@@ -172,32 +163,20 @@ class WaveRNN(nn.Layer): ...@@ -172,32 +163,20 @@ class WaveRNN(nn.Layer):
mode='RAW', mode='RAW',
init_type: str="xavier_uniform", ): init_type: str="xavier_uniform", ):
''' '''
Parameters Args:
---------- rnn_dims (int, optional): Hidden dims of RNN Layers.
rnn_dims : int, optional fc_dims (int, optional): Dims of FC Layers.
Hidden dims of RNN Layers. bits (int, optional): bit depth of signal.
fc_dims : int, optional aux_context_window (int, optional): The context window size of the first convolution applied to the
Dims of FC Layers.
bits : int, optional
bit depth of signal.
aux_context_window : int, optional
The context window size of the first convolution applied to the
auxiliary input, by default 2 auxiliary input, by default 2
upsample_scales : List[int], optional upsample_scales (List[int], optional): Upsample scales of the upsample network.
Upsample scales of the upsample network. aux_channels (int, optional): Auxiliary channel of the residual blocks.
aux_channels : int, optional compute_dims (int, optional): Dims of Conv1D in MelResNet.
Auxiliary channel of the residual blocks. res_out_dims (int, optional): Dims of output in MelResNet.
compute_dims : int, optional res_blocks (int, optional): Number of residual blocks.
Dims of Conv1D in MelResNet. mode (str, optional): Output mode of the WaveRNN vocoder.
res_out_dims : int, optional `MOL` for Mixture of Logistic Distribution, and `RAW` for quantized bits as the model's output.
Dims of output in MelResNet. init_type (str): How to initialize parameters.
res_blocks : int, optional
Number of residual blocks.
mode : str, optional
Output mode of the WaveRNN vocoder. `MOL` for Mixture of Logistic Distribution,
and `RAW` for quantized bits as the model's output.
init_type : str
How to initialize parameters.
''' '''
super().__init__() super().__init__()
self.mode = mode self.mode = mode
...@@ -245,18 +224,13 @@ class WaveRNN(nn.Layer): ...@@ -245,18 +224,13 @@ class WaveRNN(nn.Layer):
def forward(self, x, c): def forward(self, x, c):
''' '''
Parameters Args:
---------- x (Tensor): wav sequence, [B, T]
x : Tensor c (Tensor): mel spectrogram [B, C_aux, T']
wav sequence, [B, T]
c : Tensor
mel spectrogram [B, C_aux, T']
T = (T' - 2 * aux_context_window ) * hop_length T = (T' - 2 * aux_context_window ) * hop_length
Returns Returns:
---------- Tensor: [B, T, n_classes]
Tensor
[B, T, n_classes]
''' '''
# Although we `_flatten_parameters()` on init, when using DataParallel # Although we `_flatten_parameters()` on init, when using DataParallel
# the model gets replicated, making it no longer guaranteed that the # the model gets replicated, making it no longer guaranteed that the
...@@ -304,22 +278,14 @@ class WaveRNN(nn.Layer): ...@@ -304,22 +278,14 @@ class WaveRNN(nn.Layer):
mu_law: bool=True, mu_law: bool=True,
gen_display: bool=False): gen_display: bool=False):
""" """
Parameters Args:
---------- c(Tensor): input mels, (T', C_aux)
c : Tensor batched(bool): generate in batch or not
input mels, (T', C_aux) target(int): target number of samples to be generated in each batch entry
batched : bool overlap(int): number of samples for crossfading between batches
generate in batch or not mu_law(bool): use mu law or not
target : int Returns:
target number of samples to be generated in each batch entry wav sequence: Output (T' * prod(upsample_scales), out_channels, C_out).
overlap : int
number of samples for crossfading between batches
mu_law : bool
use mu law or not
Returns
----------
wav sequence
Output (T' * prod(upsample_scales), out_channels, C_out).
""" """
self.eval() self.eval()
...@@ -434,15 +400,12 @@ class WaveRNN(nn.Layer): ...@@ -434,15 +400,12 @@ class WaveRNN(nn.Layer):
def pad_tensor(self, x, pad, side='both'): def pad_tensor(self, x, pad, side='both'):
''' '''
Parameters Args:
---------- x(Tensor): mel, [1, n_frames, 80]
x : Tensor pad(int):
mel, [1, n_frames, 80] side(str, optional): 'both', 'before' or 'after'. (Default value = 'both')
pad : int
side : str Returns:
'both', 'before' or 'after'
Returns
----------
Tensor Tensor
''' '''
b, t, _ = paddle.shape(x) b, t, _ = paddle.shape(x)
...@@ -461,33 +424,24 @@ class WaveRNN(nn.Layer): ...@@ -461,33 +424,24 @@ class WaveRNN(nn.Layer):
Fold the tensor with overlap for quick batched inference. Fold the tensor with overlap for quick batched inference.
Overlap will be used for crossfading in xfade_and_unfold() Overlap will be used for crossfading in xfade_and_unfold()
Parameters Args:
---------- x(Tensor): Upsampled conditioning features. mels or aux
x : Tensor
Upsampled conditioning features. mels or aux
shape=(1, T, features) shape=(1, T, features)
mels: [1, T, 80] mels: [1, T, 80]
aux: [1, T, 128] aux: [1, T, 128]
target : int target(int): Target timesteps for each index of batch
Target timesteps for each index of batch overlap(int): Timesteps for both xfade and rnn warmup
overlap : int
Timesteps for both xfade and rnn warmup Returns:
overlap = hop_length * 2 Tensor:
Returns
----------
Tensor
shape=(num_folds, target + 2 * overlap, features) shape=(num_folds, target + 2 * overlap, features)
num_folds = (time_seq - overlap) // (target + overlap) num_folds = (time_seq - overlap) // (target + overlap)
mel: [num_folds, target + 2 * overlap, 80] mel: [num_folds, target + 2 * overlap, 80]
aux: [num_folds, target + 2 * overlap, 128] aux: [num_folds, target + 2 * overlap, 128]
Details Details:
----------
x = [[h1, h2, ... hn]] x = [[h1, h2, ... hn]]
Where each h is a vector of conditioning features Where each h is a vector of conditioning features
Eg: target=2, overlap=1 with x.size(1)=10 Eg: target=2, overlap=1 with x.size(1)=10
folded = [[h1, h2, h3, h4], folded = [[h1, h2, h3, h4],
...@@ -520,24 +474,20 @@ class WaveRNN(nn.Layer): ...@@ -520,24 +474,20 @@ class WaveRNN(nn.Layer):
def xfade_and_unfold(self, y, target: int=12000, overlap: int=600): def xfade_and_unfold(self, y, target: int=12000, overlap: int=600):
''' Applies a crossfade and unfolds into a 1d array. ''' Applies a crossfade and unfolds into a 1d array.
Parameters Args:
---------- y (Tensor):
y : Tensor
Batched sequences of audio samples Batched sequences of audio samples
shape=(num_folds, target + 2 * overlap) shape=(num_folds, target + 2 * overlap)
dtype=paddle.float32 dtype=paddle.float32
overlap : int overlap (int): Timesteps for both xfade and rnn warmup
Timesteps for both xfade and rnn warmup
Returns Returns:
----------
Tensor Tensor
audio samples in a 1d array audio samples in a 1d array
shape=(total_len) shape=(total_len)
dtype=paddle.float32 dtype=paddle.float32
Details Details:
----------
y = [[seq1], y = [[seq1],
[seq2], [seq2],
[seq3]] [seq3]]
......
...@@ -41,14 +41,10 @@ class CausalConv1D(nn.Layer): ...@@ -41,14 +41,10 @@ class CausalConv1D(nn.Layer):
def forward(self, x): def forward(self, x):
"""Calculate forward propagation. """Calculate forward propagation.
Parameters Args:
---------- x (Tensor): Input tensor (B, in_channels, T).
x : Tensor Returns:
Input tensor (B, in_channels, T). Tensor: Output tensor (B, out_channels, T).
Returns
----------
Tensor
Output tensor (B, out_channels, T).
""" """
return self.conv(self.pad(x))[:, :, :x.shape[2]] return self.conv(self.pad(x))[:, :, :x.shape[2]]
...@@ -70,13 +66,9 @@ class CausalConv1DTranspose(nn.Layer): ...@@ -70,13 +66,9 @@ class CausalConv1DTranspose(nn.Layer):
def forward(self, x): def forward(self, x):
"""Calculate forward propagation. """Calculate forward propagation.
Parameters Args:
---------- x (Tensor): Input tensor (B, in_channels, T_in).
x : Tensor Returns:
Input tensor (B, in_channels, T_in). Tensor: Output tensor (B, out_channels, T_out).
Returns
----------
Tensor
Output tensor (B, out_channels, T_out).
""" """
return self.deconv(x)[:, :, :-self.stride] return self.deconv(x)[:, :, :-self.stride]
...@@ -18,12 +18,10 @@ from paddle import nn ...@@ -18,12 +18,10 @@ from paddle import nn
class ConvolutionModule(nn.Layer): class ConvolutionModule(nn.Layer):
"""ConvolutionModule in Conformer model. """ConvolutionModule in Conformer model.
Parameters
---------- Args:
channels : int channels (int): The number of channels of conv layers.
The number of channels of conv layers. kernel_size (int): Kernel size of conv layers.
kernel_size : int
Kernerl size of conv layers.
""" """
def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True): def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True):
...@@ -59,14 +57,11 @@ class ConvolutionModule(nn.Layer): ...@@ -59,14 +57,11 @@ class ConvolutionModule(nn.Layer):
def forward(self, x): def forward(self, x):
"""Compute convolution module. """Compute convolution module.
Parameters
---------- Args:
x : paddle.Tensor x (Tensor): Input tensor (#batch, time, channels).
Input tensor (#batch, time, channels). Returns:
Returns Tensor: Output tensor (#batch, time, channels).
----------
paddle.Tensor
Output tensor (#batch, time, channels).
""" """
# exchange the temporal dimension and the feature dimension # exchange the temporal dimension and the feature dimension
x = x.transpose([0, 2, 1]) x = x.transpose([0, 2, 1])
......
...@@ -21,36 +21,27 @@ from paddlespeech.t2s.modules.layer_norm import LayerNorm ...@@ -21,36 +21,27 @@ from paddlespeech.t2s.modules.layer_norm import LayerNorm
class EncoderLayer(nn.Layer): class EncoderLayer(nn.Layer):
"""Encoder layer module. """Encoder layer module.
Parameters
---------- Args:
size : int size (int): Input dimension.
Input dimension. self_attn (nn.Layer): Self-attention module instance.
self_attn : nn.Layer
Self-attention module instance.
`MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
can be used as the argument. can be used as the argument.
feed_forward : nn.Layer feed_forward (nn.Layer): Feed-forward module instance.
Feed-forward module instance.
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
can be used as the argument. can be used as the argument.
feed_forward_macaron : nn.Layer feed_forward_macaron (nn.Layer): Additional feed-forward module instance.
Additional feed-forward module instance.
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
can be used as the argument. can be used as the argument.
conv_module : nn.Layer conv_module (nn.Layer): Convolution module instance.
Convolution module instance.
`ConvolutionModule` instance can be used as the argument. `ConvolutionModule` instance can be used as the argument.
dropout_rate : float dropout_rate (float): Dropout rate.
Dropout rate. normalize_before (bool): Whether to use layer_norm before the first block.
normalize_before : bool concat_after (bool): Whether to concat attention layer's input and output.
Whether to use layer_norm before the first block.
concat_after : bool
Whether to concat attention layer's input and output.
if True, additional linear will be applied. if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x))) i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x) if False, no additional linear will be applied. i.e. x -> x + att(x)
stochastic_depth_rate : float stochastic_depth_rate (float): Probability to skip this layer.
Proability to skip this layer.
During training, the layer may skip residual computation and return input During training, the layer may skip residual computation and return input
as-is with given probability. as-is with given probability.
""" """
...@@ -93,22 +84,17 @@ class EncoderLayer(nn.Layer): ...@@ -93,22 +84,17 @@ class EncoderLayer(nn.Layer):
def forward(self, x_input, mask, cache=None): def forward(self, x_input, mask, cache=None):
"""Compute encoded features. """Compute encoded features.
Parameters
---------- Args:
x_input : Union[Tuple, paddle.Tensor] x_input(Union[Tuple, Tensor]): Input tensor w/ or w/o pos emb.
Input tensor w/ or w/o pos emb.
- w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)]. - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
- w/o pos emb: Tensor (#batch, time, size). - w/o pos emb: Tensor (#batch, time, size).
mask : paddle.Tensor mask(Tensor): Mask tensor for the input (#batch, time).
Mask tensor for the input (#batch, time). cache (Tensor): Cache tensor of the input (#batch, time - 1, size).
cache paddle.Tensor
Cache tensor of the input (#batch, time - 1, size). Returns:
Returns Tensor: Output tensor (#batch, time, size).
---------- Tensor: Mask tensor (#batch, time).
paddle.Tensor
Output tensor (#batch, time, size).
paddle.Tensor
Mask tensor (#batch, time).
""" """
if isinstance(x_input, tuple): if isinstance(x_input, tuple):
x, pos_emb = x_input[0], x_input[1] x, pos_emb = x_input[0], x_input[1]
......
...@@ -41,24 +41,17 @@ class Conv1dCell(nn.Conv1D): ...@@ -41,24 +41,17 @@ class Conv1dCell(nn.Conv1D):
Thus, these arguments are removed from the ``__init__`` method of this Thus, these arguments are removed from the ``__init__`` method of this
class. class.
Parameters Args:
---------- in_channels (int): The feature size of the input.
in_channels: int out_channels (int): The feature size of the output.
The feature size of the input. kernel_size (int or Tuple[int]): The size of the kernel.
out_channels: int dilation (int or Tuple[int]): The dilation of the convolution, by default 1
The feature size of the output. weight_attr (ParamAttr, Initializer, str or bool, optional) : The parameter attribute of the convolution kernel,
kernel_size: int or Tuple[int] by default None.
The size of the kernel. bias_attr (ParamAttr, Initializer, str or bool, optional):The parameter attribute of the bias.
dilation: int or Tuple[int] If ``False``, this layer does not have a bias, by default None.
The dilation of the convolution, by default 1
weight_attr: ParamAttr, Initializer, str or bool, optional Examples:
The parameter attribute of the convolution kernel, by default None.
bias_attr: ParamAttr, Initializer, str or bool, optional
The parameter attribute of the bias. If ``False``, this layer does not
have a bias, by default None.
Examples
--------
>>> cell = Conv1dCell(3, 4, kernel_size=5) >>> cell = Conv1dCell(3, 4, kernel_size=5)
>>> inputs = [paddle.randn([4, 3]) for _ in range(16)] >>> inputs = [paddle.randn([4, 3]) for _ in range(16)]
>>> outputs = [] >>> outputs = []
...@@ -103,13 +96,11 @@ class Conv1dCell(nn.Conv1D): ...@@ -103,13 +96,11 @@ class Conv1dCell(nn.Conv1D):
def start_sequence(self): def start_sequence(self):
"""Prepare the layer for a series of incremental forward. """Prepare the layer for a series of incremental forward.
Warnings Warnings:
---------
This method should be called before a sequence of calls to This method should be called before a sequence of calls to
``add_input``. ``add_input``.
Raises Raises:
------
Exception Exception
If this method is called when the layer is in training mode. If this method is called when the layer is in training mode.
""" """
...@@ -130,10 +121,9 @@ class Conv1dCell(nn.Conv1D): ...@@ -130,10 +121,9 @@ class Conv1dCell(nn.Conv1D):
def initialize_buffer(self, x_t): def initialize_buffer(self, x_t):
"""Initialize the buffer for the step input. """Initialize the buffer for the step input.
Parameters Args:
---------- x_t (Tensor): The step input. shape=(batch_size, in_channels)
x_t : Tensor [shape=(batch_size, in_channels)]
The step input.
""" """
batch_size, _ = x_t.shape batch_size, _ = x_t.shape
self._buffer = paddle.zeros( self._buffer = paddle.zeros(
...@@ -143,10 +133,9 @@ class Conv1dCell(nn.Conv1D): ...@@ -143,10 +133,9 @@ class Conv1dCell(nn.Conv1D):
def update_buffer(self, x_t): def update_buffer(self, x_t):
"""Shift the buffer by one step. """Shift the buffer by one step.
Parameters Args:
---------- x_t (Tensor): The step input. shape=(batch_size, in_channels)
x_t : Tensor [shape=(batch_size, in_channels)]
The step input.
""" """
self._buffer = paddle.concat( self._buffer = paddle.concat(
[self._buffer[:, :, 1:], paddle.unsqueeze(x_t, -1)], -1) [self._buffer[:, :, 1:], paddle.unsqueeze(x_t, -1)], -1)
...@@ -154,15 +143,12 @@ class Conv1dCell(nn.Conv1D): ...@@ -154,15 +143,12 @@ class Conv1dCell(nn.Conv1D):
def add_input(self, x_t): def add_input(self, x_t):
"""Add step input and compute step output. """Add step input and compute step output.
Parameters Args:
----------- x_t (Tensor): The step input. shape=(batch_size, in_channels)
x_t : Tensor [shape=(batch_size, in_channels)]
The step input. Returns:
y_t (Tensor): The step output. shape=(batch_size, out_channels)
Returns
-------
y_t :Tensor [shape=(batch_size, out_channels)]
The step output.
""" """
batch_size = x_t.shape[0] batch_size = x_t.shape[0]
if self.receptive_field > 1: if self.receptive_field > 1:
...@@ -186,33 +172,26 @@ class Conv1dCell(nn.Conv1D): ...@@ -186,33 +172,26 @@ class Conv1dCell(nn.Conv1D):
class Conv1dBatchNorm(nn.Layer): class Conv1dBatchNorm(nn.Layer):
"""A Conv1D Layer followed by a BatchNorm1D. """A Conv1D Layer followed by a BatchNorm1D.
Parameters Args:
---------- in_channels (int): The feature size of the input.
in_channels : int out_channels (int): The feature size of the output.
The feature size of the input. kernel_size (int): The size of the convolution kernel.
out_channels : int stride (int, optional): The stride of the convolution, by default 1.
The feature size of the output. padding (int, str or Tuple[int], optional):
kernel_size : int
The size of the convolution kernel.
stride : int, optional
The stride of the convolution, by default 1.
padding : int, str or Tuple[int], optional
The padding of the convolution. The padding of the convolution.
If int, a symmetrical padding is applied before convolution; If int, a symmetrical padding is applied before convolution;
If str, it should be "same" or "valid"; If str, it should be "same" or "valid";
If Tuple[int], its length should be 2, meaning If Tuple[int], its length should be 2, meaning
``(pad_before, pad_after)``, by default 0. ``(pad_before, pad_after)``, by default 0.
weight_attr : ParamAttr, Initializer, str or bool, optional weight_attr (ParamAttr, Initializer, str or bool, optional):
The parameter attribute of the convolution kernel, by default None. The parameter attribute of the convolution kernel,
bias_attr : ParamAttr, Initializer, str or bool, optional by default None.
The parameter attribute of the bias of the convolution, by default bias_attr (ParamAttr, Initializer, str or bool, optional):
None. The parameter attribute of the bias of the convolution,
data_format : str ["NCL" or "NLC"], optional by default None.
The data layout of the input, by default "NCL" data_format (str ["NCL" or "NLC"], optional): The data layout of the input, by default "NCL"
momentum : float, optional momentum (float, optional): The momentum of the BatchNorm1D layer, by default 0.9
The momentum of the BatchNorm1D layer, by default 0.9 epsilon (float, optional): The epsilon of the BatchNorm1D layer, by default 1e-05
epsilon : [type], optional
The epsilon of the BatchNorm1D layer, by default 1e-05
""" """
def __init__(self, def __init__(self,
...@@ -245,15 +224,14 @@ class Conv1dBatchNorm(nn.Layer): ...@@ -245,15 +224,14 @@ class Conv1dBatchNorm(nn.Layer):
def forward(self, x): def forward(self, x):
"""Forward pass of the Conv1dBatchNorm layer. """Forward pass of the Conv1dBatchNorm layer.
Parameters Args:
---------- x (Tensor): The input tensor. Its data layout depends on ``data_format``.
x : Tensor [shape=(B, C_in, T_in) or (B, T_in, C_in)] shape=(B, C_in, T_in) or (B, T_in, C_in)
The input tensor. Its data layout depends on ``data_format``.
Returns:
Tensor: The output tensor.
shape=(B, C_out, T_out) or (B, T_out, C_out)
Returns
-------
Tensor [shape=(B, C_out, T_out) or (B, T_out, C_out)]
The output tensor.
""" """
x = self.conv(x) x = self.conv(x)
x = self.bn(x) x = self.bn(x)
......
...@@ -18,23 +18,17 @@ import paddle ...@@ -18,23 +18,17 @@ import paddle
def shuffle_dim(x, axis, perm=None): def shuffle_dim(x, axis, perm=None):
"""Permute input tensor along axis given the permutation or randomly. """Permute input tensor along axis given the permutation or randomly.
Parameters Args:
---------- x (Tensor): The input tensor.
x : Tensor axis (int): The axis to shuffle.
The input tensor. perm (List[int], ndarray, optional):
axis : int
The axis to shuffle.
perm : List[int], ndarray, optional
The order to reorder the tensor along the ``axis``-th dimension. The order to reorder the tensor along the ``axis``-th dimension.
It is a permutation of ``[0, d)``, where d is the size of the It is a permutation of ``[0, d)``, where d is the size of the
``axis``-th dimension of the input tensor. If not provided, ``axis``-th dimension of the input tensor. If not provided,
a random permutation is used. Defaults to None. a random permutation is used. Defaults to None.
Returns Returns:
--------- Tensor: The shuffled tensor, which has the same shape as x does.
Tensor
The shuffled tensor, which has the same shape as x does.
""" """
size = x.shape[axis] size = x.shape[axis]
if perm is not None and len(perm) != size: if perm is not None and len(perm) != size:
......
...@@ -18,13 +18,9 @@ from paddle import nn ...@@ -18,13 +18,9 @@ from paddle import nn
class LayerNorm(nn.LayerNorm): class LayerNorm(nn.LayerNorm):
"""Layer normalization module. """Layer normalization module.
Args:
Parameters nout (int): Output dim size.
---------- dim (int): Dimension to be normalized.
nout : int
Output dim size.
dim : int
Dimension to be normalized.
""" """
def __init__(self, nout, dim=-1): def __init__(self, nout, dim=-1):
...@@ -35,15 +31,11 @@ class LayerNorm(nn.LayerNorm): ...@@ -35,15 +31,11 @@ class LayerNorm(nn.LayerNorm):
def forward(self, x): def forward(self, x):
"""Apply layer normalization. """Apply layer normalization.
Parameters Args:
---------- x (Tensor):Input tensor.
x : paddle.Tensor
Input tensor.
Returns Returns:
---------- Tensor: Normalized tensor.
paddle.Tensor
Normalized tensor.
""" """
if self.dim == -1: if self.dim == -1:
......
...@@ -118,16 +118,13 @@ def discretized_mix_logistic_loss(y_hat, ...@@ -118,16 +118,13 @@ def discretized_mix_logistic_loss(y_hat,
def sample_from_discretized_mix_logistic(y, log_scale_min=None): def sample_from_discretized_mix_logistic(y, log_scale_min=None):
""" """
Sample from discretized mixture of logistic distributions Sample from discretized mixture of logistic distributions
Parameters
---------- Args:
y : Tensor y(Tensor): (B, C, T)
(B, C, T) log_scale_min(float, optional): Log scale minimum value. (Default value = None)
log_scale_min : float
Log scale minimum value Returns:
Returns Tensor: sample in range of [-1, 1].
----------
Tensor
sample in range of [-1, 1].
""" """
if log_scale_min is None: if log_scale_min is None:
log_scale_min = float(np.log(1e-14)) log_scale_min = float(np.log(1e-14))
...@@ -181,14 +178,10 @@ class GuidedAttentionLoss(nn.Layer): ...@@ -181,14 +178,10 @@ class GuidedAttentionLoss(nn.Layer):
def __init__(self, sigma=0.4, alpha=1.0, reset_always=True): def __init__(self, sigma=0.4, alpha=1.0, reset_always=True):
"""Initialize guided attention loss module. """Initialize guided attention loss module.
Parameters Args:
---------- sigma (float, optional): Standard deviation to control how close attention to a diagonal.
sigma : float, optional alpha (float, optional): Scaling coefficient (lambda).
Standard deviation to control how close attention to a diagonal. reset_always (bool, optional): Whether to always reset masks.
alpha : float, optional
Scaling coefficient (lambda).
reset_always : bool, optional
Whether to always reset masks.
""" """
super().__init__() super().__init__()
...@@ -205,19 +198,13 @@ class GuidedAttentionLoss(nn.Layer): ...@@ -205,19 +198,13 @@ class GuidedAttentionLoss(nn.Layer):
def forward(self, att_ws, ilens, olens): def forward(self, att_ws, ilens, olens):
"""Calculate forward propagation. """Calculate forward propagation.
Parameters Args:
---------- att_ws(Tensor): Batch of attention weights (B, T_max_out, T_max_in).
att_ws : Tensor ilens(Tensor(int64)): Batch of input lengths (B,).
Batch of attention weights (B, T_max_out, T_max_in). olens(Tensor(int64)): Batch of output lengths (B,).
ilens : Tensor(int64)
Batch of input lenghts (B,). Returns:
olens : Tensor(int64) Tensor: Guided attention loss value.
Batch of output lenghts (B,).
Returns
----------
Tensor
Guided attention loss value.
""" """
if self.guided_attn_masks is None: if self.guided_attn_masks is None:
...@@ -282,20 +269,14 @@ class GuidedAttentionLoss(nn.Layer): ...@@ -282,20 +269,14 @@ class GuidedAttentionLoss(nn.Layer):
def _make_masks(ilens, olens): def _make_masks(ilens, olens):
"""Make masks indicating non-padded part. """Make masks indicating non-padded part.
Parameters Args:
---------- ilens(Tensor(int64) or List): Batch of lengths (B,).
ilens : Tensor(int64) or List olens(Tensor(int64) or List): Batch of lengths (B,).
Batch of lengths (B,).
olens : Tensor(int64) or List
Batch of lengths (B,).
Returns Returns:
---------- Tensor: Mask tensor indicating non-padded part.
Tensor
Mask tensor indicating non-padded part.
Examples Examples:
----------
>>> ilens, olens = [5, 2], [8, 5] >>> ilens, olens = [5, 2], [8, 5]
>>> _make_mask(ilens, olens) >>> _make_mask(ilens, olens)
tensor([[[1, 1, 1, 1, 1], tensor([[[1, 1, 1, 1, 1],
...@@ -330,34 +311,24 @@ class GuidedAttentionLoss(nn.Layer): ...@@ -330,34 +311,24 @@ class GuidedAttentionLoss(nn.Layer):
class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss): class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss):
"""Guided attention loss function module for multi head attention. """Guided attention loss function module for multi head attention.
Parameters Args:
---------- sigma (float, optional): Standard deviation to control
sigma : float, optional
Standard deviation to controlGuidedAttentionLoss
how close attention to a diagonal. how close attention to a diagonal.
alpha : float, optional alpha (float, optional): Scaling coefficient (lambda).
Scaling coefficient (lambda). reset_always (bool, optional): Whether to always reset masks.
reset_always : bool, optional
Whether to always reset masks.
""" """
def forward(self, att_ws, ilens, olens): def forward(self, att_ws, ilens, olens):
"""Calculate forward propagation. """Calculate forward propagation.
Parameters Args:
---------- att_ws(Tensor): Batch of multi head attention weights (B, H, T_max_out, T_max_in).
att_ws : Tensor ilens(Tensor): Batch of input lengths (B,).
Batch of multi head attention weights (B, H, T_max_out, T_max_in). olens(Tensor): Batch of output lengths (B,).
ilens : Tensor
Batch of input lenghts (B,). Returns:
olens : Tensor Tensor: Guided attention loss value.
Batch of output lenghts (B,).
Returns
----------
Tensor
Guided attention loss value.
""" """
if self.guided_attn_masks is None: if self.guided_attn_masks is None:
...@@ -382,14 +353,11 @@ class Tacotron2Loss(nn.Layer): ...@@ -382,14 +353,11 @@ class Tacotron2Loss(nn.Layer):
use_weighted_masking=False, use_weighted_masking=False,
bce_pos_weight=20.0): bce_pos_weight=20.0):
"""Initialize Tacotron2 loss module. """Initialize Tacotron2 loss module.
Parameters
---------- Args:
use_masking : bool use_masking (bool): Whether to apply masking for padded part in loss calculation.
Whether to apply masking for padded part in loss calculation. use_weighted_masking (bool): Whether to apply weighted masking in loss calculation.
use_weighted_masking : bool bce_pos_weight (float): Weight of positive sample of stop token.
Whether to apply weighted masking in loss calculation.
bce_pos_weight : float
Weight of positive sample of stop token.
""" """
super().__init__() super().__init__()
assert (use_masking != use_weighted_masking) or not use_masking assert (use_masking != use_weighted_masking) or not use_masking
...@@ -405,28 +373,19 @@ class Tacotron2Loss(nn.Layer): ...@@ -405,28 +373,19 @@ class Tacotron2Loss(nn.Layer):
def forward(self, after_outs, before_outs, logits, ys, stop_labels, olens): def forward(self, after_outs, before_outs, logits, ys, stop_labels, olens):
"""Calculate forward propagation. """Calculate forward propagation.
Parameters
---------- Args:
after_outs : Tensor after_outs(Tensor): Batch of outputs after postnets (B, Lmax, odim).
Batch of outputs after postnets (B, Lmax, odim). before_outs(Tensor): Batch of outputs before postnets (B, Lmax, odim).
before_outs : Tensor logits(Tensor): Batch of stop logits (B, Lmax).
Batch of outputs before postnets (B, Lmax, odim). ys(Tensor): Batch of padded target features (B, Lmax, odim).
logits : Tensor stop_labels(Tensor(int64)): Batch of the sequences of stop token labels (B, Lmax).
Batch of stop logits (B, Lmax). olens(Tensor(int64)): Batch of the lengths of each target (B,).
ys : Tensor
Batch of padded target features (B, Lmax, odim). Returns:
stop_labels : Tensor(int64) Tensor: L1 loss value.
Batch of the sequences of stop token labels (B, Lmax). Tensor: Mean square error loss value.
olens : Tensor(int64) Tensor: Binary cross entropy loss value.
Batch of the lengths of each target (B,).
Returns
----------
Tensor
L1 loss value.
Tensor
Mean square error loss value.
Tensor
Binary cross entropy loss value.
""" """
# make mask and apply it # make mask and apply it
if self.use_masking: if self.use_masking:
...@@ -513,28 +472,20 @@ def stft(x, ...@@ -513,28 +472,20 @@ def stft(x,
center=True, center=True,
pad_mode='reflect'): pad_mode='reflect'):
"""Perform STFT and convert to magnitude spectrogram. """Perform STFT and convert to magnitude spectrogram.
Parameters Args:
---------- x(Tensor): Input signal tensor (B, T).
x : Tensor fft_size(int): FFT size.
Input signal tensor (B, T). hop_size(int): Hop size.
fft_size : int win_length(int, optional): Window length passed to the window function. (Default value = None)
FFT size. window(str, optional): Name of window function, see `scipy.signal.get_window` for more
hop_size : int
Hop size.
win_length : int
window : str, optional
window : str
Name of window function, see `scipy.signal.get_window` for more
details. Defaults to "hann". details. Defaults to "hann".
center : bool, optional center(bool, optional): Whether to pad `x` to make that the
center (bool, optional): Whether to pad `x` to make that the
:math:`t \times hop\\_length` at the center of :math:`t`-th frame. Default: `True`. :math:`t \times hop\\_length` at the center of :math:`t`-th frame. Default: `True`.
pad_mode : str, optional pad_mode(str, optional): Choose padding pattern when `center` is `True`. (Default value = 'reflect')
Choose padding pattern when `center` is `True`. hop_length: (Default value = None)
Returns
---------- Returns:
Tensor: Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
""" """
# calculate window # calculate window
window = signal.get_window(window, win_length, fftbins=True) window = signal.get_window(window, win_length, fftbins=True)
...@@ -564,16 +515,11 @@ class SpectralConvergenceLoss(nn.Layer): ...@@ -564,16 +515,11 @@ class SpectralConvergenceLoss(nn.Layer):
def forward(self, x_mag, y_mag): def forward(self, x_mag, y_mag):
"""Calculate forward propagation. """Calculate forward propagation.
Parameters Args:
---------- x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
x_mag : Tensor y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). Returns:
y_mag : Tensor) Tensor: Spectral convergence loss value.
Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
Returns
----------
Tensor
Spectral convergence loss value.
""" """
return paddle.norm( return paddle.norm(
y_mag - x_mag, p="fro") / paddle.clip( y_mag - x_mag, p="fro") / paddle.clip(
...@@ -590,16 +536,11 @@ class LogSTFTMagnitudeLoss(nn.Layer): ...@@ -590,16 +536,11 @@ class LogSTFTMagnitudeLoss(nn.Layer):
def forward(self, x_mag, y_mag): def forward(self, x_mag, y_mag):
"""Calculate forward propagation. """Calculate forward propagation.
Parameters Args:
---------- x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
x_mag : Tensor y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). Returns:
y_mag : Tensor Tensor: Log STFT magnitude loss value.
Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
Returns
----------
Tensor
Log STFT magnitude loss value.
""" """
return F.l1_loss( return F.l1_loss(
paddle.log(paddle.clip(y_mag, min=self.epsilon)), paddle.log(paddle.clip(y_mag, min=self.epsilon)),
...@@ -625,18 +566,12 @@ class STFTLoss(nn.Layer): ...@@ -625,18 +566,12 @@ class STFTLoss(nn.Layer):
def forward(self, x, y): def forward(self, x, y):
"""Calculate forward propagation. """Calculate forward propagation.
Parameters Args:
---------- x (Tensor): Predicted signal (B, T).
x : Tensor y (Tensor): Groundtruth signal (B, T).
Predicted signal (B, T). Returns:
y : Tensor Tensor: Spectral convergence loss value.
Groundtruth signal (B, T). Tensor: Log STFT magnitude loss value.
Returns
----------
Tensor
Spectral convergence loss value.
Tensor
Log STFT magnitude loss value.
""" """
x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, x_mag = stft(x, self.fft_size, self.shift_size, self.win_length,
self.window) self.window)
...@@ -658,16 +593,11 @@ class MultiResolutionSTFTLoss(nn.Layer): ...@@ -658,16 +593,11 @@ class MultiResolutionSTFTLoss(nn.Layer):
win_lengths=[600, 1200, 240], win_lengths=[600, 1200, 240],
window="hann", ): window="hann", ):
"""Initialize Multi resolution STFT loss module. """Initialize Multi resolution STFT loss module.
Parameters Args:
---------- fft_sizes (list): List of FFT sizes.
fft_sizes : list hop_sizes (list): List of hop sizes.
List of FFT sizes. win_lengths (list): List of window lengths.
hop_sizes : list window (str): Window function type.
List of hop sizes.
win_lengths : list
List of window lengths.
window : str
Window function type.
""" """
super().__init__() super().__init__()
assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
...@@ -677,18 +607,13 @@ class MultiResolutionSTFTLoss(nn.Layer): ...@@ -677,18 +607,13 @@ class MultiResolutionSTFTLoss(nn.Layer):
def forward(self, x, y): def forward(self, x, y):
"""Calculate forward propagation. """Calculate forward propagation.
Parameters
---------- Args:
x : Tensor x (Tensor): Predicted signal (B, T) or (B, #subband, T).
Predicted signal (B, T) or (B, #subband, T). y (Tensor): Groundtruth signal (B, T) or (B, #subband, T).
y : Tensor Returns:
Groundtruth signal (B, T) or (B, #subband, T). Tensor: Multi resolution spectral convergence loss value.
Returns Tensor: Multi resolution log STFT magnitude loss value.
----------
Tensor
Multi resolution spectral convergence loss value.
Tensor
Multi resolution log STFT magnitude loss value.
""" """
if len(x.shape) == 3: if len(x.shape) == 3:
# (B, C, T) -> (B x C, T) # (B, C, T) -> (B x C, T)
...@@ -725,14 +650,10 @@ class GeneratorAdversarialLoss(nn.Layer): ...@@ -725,14 +650,10 @@ class GeneratorAdversarialLoss(nn.Layer):
def forward(self, outputs): def forward(self, outputs):
"""Calculate generator adversarial loss. """Calculate generator adversarial loss.
Parameters Args:
---------- outputs (Tensor or List): Discriminator outputs or list of discriminator outputs.
outputs: Tensor or List Returns:
Discriminator outputs or list of discriminator outputs. Tensor: Generator adversarial loss value.
Returns
----------
Tensor
Generator adversarial loss value.
""" """
if isinstance(outputs, (tuple, list)): if isinstance(outputs, (tuple, list)):
adv_loss = 0.0 adv_loss = 0.0
...@@ -772,20 +693,15 @@ class DiscriminatorAdversarialLoss(nn.Layer): ...@@ -772,20 +693,15 @@ class DiscriminatorAdversarialLoss(nn.Layer):
def forward(self, outputs_hat, outputs): def forward(self, outputs_hat, outputs):
"""Calculate discriminator adversarial loss. """Calculate discriminator adversarial loss.
Parameters
---------- Args:
outputs_hat : Tensor or list outputs_hat (Tensor or list): Discriminator outputs or list of
Discriminator outputs or list of
discriminator outputs calculated from generator outputs. discriminator outputs calculated from generator outputs.
outputs : Tensor or list outputs (Tensor or list): Discriminator outputs or list of
Discriminator outputs or list of
discriminator outputs calculated from groundtruth. discriminator outputs calculated from groundtruth.
Returns Returns:
---------- Tensor: Discriminator real loss value.
Tensor Tensor: Discriminator fake loss value.
Discriminator real loss value.
Tensor
Discriminator fake loss value.
""" """
if isinstance(outputs, (tuple, list)): if isinstance(outputs, (tuple, list)):
real_loss = 0.0 real_loss = 0.0
...@@ -868,17 +784,13 @@ def ssim(img1, img2, window_size=11, size_average=True): ...@@ -868,17 +784,13 @@ def ssim(img1, img2, window_size=11, size_average=True):
def weighted_mean(input, weight): def weighted_mean(input, weight):
"""Weighted mean. It can also be used as masked mean. """Weighted mean. It can also be used as masked mean.
Parameters Args:
----------- input(Tensor): The input tensor.
input : Tensor weight(Tensor): The weight tensor with broadcastable shape with the input.
The input tensor.
weight : Tensor Returns:
The weight tensor with broadcastable shape with the input. Tensor: Weighted mean tensor with the same dtype as input. shape=(1,)
Returns
----------
Tensor [shape=(1,)]
Weighted mean tensor with the same dtype as input.
""" """
weight = paddle.cast(weight, input.dtype) weight = paddle.cast(weight, input.dtype)
# paddle.Tensor.size is different from torch.size() and has been overridden in s2t.__init__ # paddle.Tensor.size is different from torch.size() and has been overridden in s2t.__init__
...@@ -889,20 +801,15 @@ def weighted_mean(input, weight): ...@@ -889,20 +801,15 @@ def weighted_mean(input, weight):
def masked_l1_loss(prediction, target, mask): def masked_l1_loss(prediction, target, mask):
"""Compute masked L1 loss. """Compute masked L1 loss.
Parameters Args:
---------- prediction(Tensor): The prediction.
prediction : Tensor target(Tensor): The target. The shape should be broadcastable to ``prediction``.
The prediction. mask(Tensor): The mask. The shape should be broadcastable to the broadcasted shape of
target : Tensor
The target. The shape should be broadcastable to ``prediction``.
mask : Tensor
The mask. The shape should be broadcastable to the broadcasted shape of
``prediction`` and ``target``. ``prediction`` and ``target``.
Returns Returns:
------- Tensor: The masked L1 loss. shape=(1,)
Tensor [shape=(1,)]
The masked L1 loss.
""" """
abs_error = F.l1_loss(prediction, target, reduction='none') abs_error = F.l1_loss(prediction, target, reduction='none')
loss = weighted_mean(abs_error, mask) loss = weighted_mean(abs_error, mask)
...@@ -975,14 +882,11 @@ class MelSpectrogram(nn.Layer): ...@@ -975,14 +882,11 @@ class MelSpectrogram(nn.Layer):
def forward(self, x): def forward(self, x):
"""Calculate Mel-spectrogram. """Calculate Mel-spectrogram.
Parameters Args:
----------
x : Tensor x (Tensor): Input waveform tensor (B, T) or (B, 1, T).
Input waveform tensor (B, T) or (B, 1, T). Returns:
Returns Tensor: Mel-spectrogram (B, #mels, #frames).
----------
Tensor
Mel-spectrogram (B, #mels, #frames).
""" """
if len(x.shape) == 3: if len(x.shape) == 3:
# (B, C, T) -> (B*C, T) # (B, C, T) -> (B*C, T)
...@@ -1047,16 +951,12 @@ class MelSpectrogramLoss(nn.Layer): ...@@ -1047,16 +951,12 @@ class MelSpectrogramLoss(nn.Layer):
def forward(self, y_hat, y): def forward(self, y_hat, y):
"""Calculate Mel-spectrogram loss. """Calculate Mel-spectrogram loss.
Parameters Args:
---------- y_hat(Tensor): Generated single tensor (B, 1, T).
y_hat : Tensor y(Tensor): Groundtruth single tensor (B, 1, T).
Generated single tensor (B, 1, T).
y : Tensor Returns:
Groundtruth single tensor (B, 1, T). Tensor: Mel-spectrogram loss value.
Returns
----------
Tensor
Mel-spectrogram loss value.
""" """
mel_hat = self.mel_spectrogram(y_hat) mel_hat = self.mel_spectrogram(y_hat)
mel = self.mel_spectrogram(y) mel = self.mel_spectrogram(y)
...@@ -1081,18 +981,14 @@ class FeatureMatchLoss(nn.Layer): ...@@ -1081,18 +981,14 @@ class FeatureMatchLoss(nn.Layer):
def forward(self, feats_hat, feats): def forward(self, feats_hat, feats):
"""Calculate feature matching loss. """Calculate feature matching loss.
Parameters
---------- Args:
feats_hat : list feats_hat(list): List of list of discriminator outputs
List of list of discriminator outputs
calculated from generator outputs. calculated from generator outputs.
feats : list feats(list): List of list of discriminator outputs
List of list of discriminator outputs
calculated from groundtruth. Returns:
Returns Tensor: Feature matching loss value.
----------
Tensor
Feature matching loss value.
""" """
feat_match_loss = 0.0 feat_match_loss = 0.0
......
...@@ -20,20 +20,14 @@ from typeguard import check_argument_types ...@@ -20,20 +20,14 @@ from typeguard import check_argument_types
def pad_list(xs, pad_value): def pad_list(xs, pad_value):
"""Perform padding for the list of tensors. """Perform padding for the list of tensors.
Parameters Args:
---------- xs (List[Tensor]): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
xs : List[Tensor] pad_value (float): Value for padding.
List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
pad_value : float) Returns:
Value for padding. Tensor: Padded tensor (B, Tmax, `*`).
Returns Examples:
----------
Tensor
Padded tensor (B, Tmax, `*`).
Examples
----------
>>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])] >>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])]
>>> x >>> x
[tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
...@@ -55,18 +49,13 @@ def pad_list(xs, pad_value): ...@@ -55,18 +49,13 @@ def pad_list(xs, pad_value):
def make_pad_mask(lengths, length_dim=-1): def make_pad_mask(lengths, length_dim=-1):
"""Make mask tensor containing indices of padded part. """Make mask tensor containing indices of padded part.
Parameters Args:
---------- lengths (Tensor(int64)): Batch of lengths (B,).
lengths : LongTensor
Batch of lengths (B,).
Returns Returns:
---------- Tensor(bool): Mask tensor containing indices of padded part bool.
Tensor(bool)
Mask tensor containing indices of padded part bool.
Examples Examples:
----------
With only lengths. With only lengths.
>>> lengths = [5, 3, 2] >>> lengths = [5, 3, 2]
...@@ -91,24 +80,17 @@ def make_pad_mask(lengths, length_dim=-1): ...@@ -91,24 +80,17 @@ def make_pad_mask(lengths, length_dim=-1):
def make_non_pad_mask(lengths, length_dim=-1): def make_non_pad_mask(lengths, length_dim=-1):
"""Make mask tensor containing indices of non-padded part. """Make mask tensor containing indices of non-padded part.
Parameters Args:
---------- lengths (Tensor(int64) or List): Batch of lengths (B,).
lengths : LongTensor or List xs (Tensor, optional): The reference tensor.
Batch of lengths (B,).
xs : Tensor, optional
The reference tensor.
If set, masks will be the same shape as this tensor. If set, masks will be the same shape as this tensor.
length_dim : int, optional length_dim (int, optional): Dimension indicator of the above tensor.
Dimension indicator of the above tensor.
See the example. See the example.
Returns Returns:
---------- Tensor(bool): mask tensor containing indices of padded part bool.
Tensor(bool)
mask tensor containing indices of padded part bool.
Examples Examples:
----------
With only lengths. With only lengths.
>>> lengths = [5, 3, 2] >>> lengths = [5, 3, 2]
...@@ -127,12 +109,9 @@ def initialize(model: nn.Layer, init: str): ...@@ -127,12 +109,9 @@ def initialize(model: nn.Layer, init: str):
Custom initialization routines can be implemented into submodules Custom initialization routines can be implemented into submodules
Parameters Args:
---------- model (nn.Layer): Target.
model : nn.Layer init (str): Method of initialization.
Target.
init : str
Method of initialization.
""" """
assert check_argument_types() assert check_argument_types()
......
...@@ -24,17 +24,13 @@ def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0): ...@@ -24,17 +24,13 @@ def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0):
"""Design prototype filter for PQMF. """Design prototype filter for PQMF.
This method is based on `A Kaiser window approach for the design of prototype This method is based on `A Kaiser window approach for the design of prototype
filters of cosine modulated filterbanks`_. filters of cosine modulated filterbanks`_.
Parameters
---------- Args:
taps : int taps (int): The number of filter taps.
The number of filter taps. cutoff_ratio (float): Cut-off frequency ratio.
cutoff_ratio : float beta (float): Beta coefficient for kaiser window.
Cut-off frequency ratio. Returns:
beta : float ndarray:
Beta coefficient for kaiser window.
Returns
----------
ndarray
Impulse response of prototype filter (taps + 1,). Impulse response of prototype filter (taps + 1,).
.. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`: .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
https://ieeexplore.ieee.org/abstract/document/681427 https://ieeexplore.ieee.org/abstract/document/681427
...@@ -68,16 +64,12 @@ class PQMF(nn.Layer): ...@@ -68,16 +64,12 @@ class PQMF(nn.Layer):
"""Initialize PQMF module. """Initialize PQMF module.
The cutoff_ratio and beta parameters are optimized for #subbands = 4. The cutoff_ratio and beta parameters are optimized for #subbands = 4.
See discussion in https://github.com/kan-bayashi/ParallelWaveGAN/issues/195. See discussion in https://github.com/kan-bayashi/ParallelWaveGAN/issues/195.
Parameters
---------- Args:
subbands : int subbands (int): The number of subbands.
The number of subbands. taps (int): The number of filter taps.
taps : int cutoff_ratio (float): Cut-off frequency ratio.
The number of filter taps. beta (float): Beta coefficient for kaiser window.
cutoff_ratio : float
Cut-off frequency ratio.
beta : float
Beta coefficient for kaiser window.
""" """
super().__init__() super().__init__()
...@@ -110,28 +102,20 @@ class PQMF(nn.Layer): ...@@ -110,28 +102,20 @@ class PQMF(nn.Layer):
def analysis(self, x): def analysis(self, x):
"""Analysis with PQMF. """Analysis with PQMF.
Parameters Args:
---------- x (Tensor): Input tensor (B, 1, T).
x : Tensor Returns:
Input tensor (B, 1, T). Tensor: Output tensor (B, subbands, T // subbands).
Returns
----------
Tensor
Output tensor (B, subbands, T // subbands).
""" """
x = F.conv1d(self.pad_fn(x), self.analysis_filter) x = F.conv1d(self.pad_fn(x), self.analysis_filter)
return F.conv1d(x, self.updown_filter, stride=self.subbands) return F.conv1d(x, self.updown_filter, stride=self.subbands)
def synthesis(self, x): def synthesis(self, x):
"""Synthesis with PQMF. """Synthesis with PQMF.
Parameters Args:
---------- x (Tensor): Input tensor (B, subbands, T // subbands).
x : Tensor Returns:
Input tensor (B, subbands, T // subbands). Tensor: Output tensor (B, 1, T).
Returns
----------
Tensor
Output tensor (B, 1, T).
""" """
x = F.conv1d_transpose( x = F.conv1d_transpose(
x, self.updown_filter * self.subbands, stride=self.subbands) x, self.updown_filter * self.subbands, stride=self.subbands)
......
...@@ -49,20 +49,13 @@ class DurationPredictor(nn.Layer): ...@@ -49,20 +49,13 @@ class DurationPredictor(nn.Layer):
offset=1.0): offset=1.0):
"""Initialize duration predictor module. """Initialize duration predictor module.
Parameters Args:
---------- idim (int):Input dimension.
idim : int n_layers (int, optional): Number of convolutional layers.
Input dimension. n_chans (int, optional): Number of channels of convolutional layers.
n_layers : int, optional kernel_size (int, optional): Kernel size of convolutional layers.
Number of convolutional layers. dropout_rate (float, optional): Dropout rate.
n_chans : int, optional offset (float, optional): Offset value to avoid nan in log domain.
Number of channels of convolutional layers.
kernel_size : int, optional
Kernel size of convolutional layers.
dropout_rate : float, optional
Dropout rate.
offset : float, optional
Offset value to avoid nan in log domain.
""" """
super().__init__() super().__init__()
...@@ -105,35 +98,23 @@ class DurationPredictor(nn.Layer): ...@@ -105,35 +98,23 @@ class DurationPredictor(nn.Layer):
def forward(self, xs, x_masks=None): def forward(self, xs, x_masks=None):
"""Calculate forward propagation. """Calculate forward propagation.
Args:
xs(Tensor): Batch of input sequences (B, Tmax, idim).
x_masks(ByteTensor, optional): Batch of masks indicating padded part (B, Tmax). (Default value = None)
Parameters Returns:
---------- Tensor: Batch of predicted durations in log domain (B, Tmax).
xs : Tensor
Batch of input sequences (B, Tmax, idim).
x_masks : ByteTensor, optional
Batch of masks indicating padded part (B, Tmax).
Returns
----------
Tensor
Batch of predicted durations in log domain (B, Tmax).
""" """
return self._forward(xs, x_masks, False) return self._forward(xs, x_masks, False)
def inference(self, xs, x_masks=None): def inference(self, xs, x_masks=None):
"""Inference duration. """Inference duration.
Args:
xs(Tensor): Batch of input sequences (B, Tmax, idim).
x_masks(Tensor(bool), optional): Batch of masks indicating padded part (B, Tmax). (Default value = None)
Parameters Returns:
---------- Tensor: Batch of predicted durations in linear domain int64 (B, Tmax).
xs : Tensor
Batch of input sequences (B, Tmax, idim).
x_masks : Tensor(bool), optional
Batch of masks indicating padded part (B, Tmax).
Returns
----------
Tensor
Batch of predicted durations in linear domain int64 (B, Tmax).
""" """
return self._forward(xs, x_masks, True) return self._forward(xs, x_masks, True)
...@@ -147,13 +128,9 @@ class DurationPredictorLoss(nn.Layer): ...@@ -147,13 +128,9 @@ class DurationPredictorLoss(nn.Layer):
def __init__(self, offset=1.0, reduction="mean"): def __init__(self, offset=1.0, reduction="mean"):
"""Initialize duration predictor loss module. """Initialize duration predictor loss module.
Args:
Parameters offset (float, optional): Offset value to avoid nan in log domain.
---------- reduction (str): Reduction type in loss calculation.
offset : float, optional
Offset value to avoid nan in log domain.
reduction : str
Reduction type in loss calculation.
""" """
super().__init__() super().__init__()
self.criterion = nn.MSELoss(reduction=reduction) self.criterion = nn.MSELoss(reduction=reduction)
...@@ -162,20 +139,14 @@ class DurationPredictorLoss(nn.Layer): ...@@ -162,20 +139,14 @@ class DurationPredictorLoss(nn.Layer):
def forward(self, outputs, targets): def forward(self, outputs, targets):
"""Calculate forward propagation. """Calculate forward propagation.
Parameters Args:
---------- outputs(Tensor): Batch of prediction durations in log domain (B, T)
outputs : Tensor targets(Tensor): Batch of groundtruth durations in linear domain (B, T)
Batch of prediction durations in log domain (B, T)
targets : Tensor
Batch of groundtruth durations in linear domain (B, T)
Returns Returns:
---------- Tensor: Mean squared error loss value.
Tensor
Mean squared error loss value.
Note Note:
----------
`outputs` is in log domain but `targets` is in linear domain. `outputs` is in log domain but `targets` is in linear domain.
""" """
# NOTE: outputs is in log domain while targets in linear # NOTE: outputs is in log domain while targets in linear
......
...@@ -35,10 +35,8 @@ class LengthRegulator(nn.Layer): ...@@ -35,10 +35,8 @@ class LengthRegulator(nn.Layer):
def __init__(self, pad_value=0.0): def __init__(self, pad_value=0.0):
"""Initialize length regulator module. """Initialize length regulator module.
Parameters Args:
---------- pad_value (float, optional): Value used for padding.
pad_value : float, optional
Value used for padding.
""" """
super().__init__() super().__init__()
...@@ -90,19 +88,13 @@ class LengthRegulator(nn.Layer): ...@@ -90,19 +88,13 @@ class LengthRegulator(nn.Layer):
def forward(self, xs, ds, alpha=1.0, is_inference=False): def forward(self, xs, ds, alpha=1.0, is_inference=False):
"""Calculate forward propagation. """Calculate forward propagation.
Parameters Args:
---------- xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D).
xs : Tensor ds (Tensor(int64)): Batch of durations of each frame (B, T).
Batch of sequences of char or phoneme embeddings (B, Tmax, D). alpha (float, optional): Alpha value to control speed of speech.
ds : Tensor(int64)
Batch of durations of each frame (B, T).
alpha : float, optional
Alpha value to control speed of speech.
Returns Returns:
---------- Tensor: replicated input tensor based on durations (B, T*, D).
Tensor
replicated input tensor based on durations (B, T*, D).
""" """
if alpha != 1.0: if alpha != 1.0:
......
...@@ -20,14 +20,10 @@ from paddle import nn ...@@ -20,14 +20,10 @@ from paddle import nn
class PositionwiseFeedForward(nn.Layer): class PositionwiseFeedForward(nn.Layer):
"""Positionwise feed forward layer. """Positionwise feed forward layer.
Parameters Args:
---------- idim (int): Input dimension.
idim : int hidden_units (int): The number of hidden units.
Input dimension. dropout_rate (float): Dropout rate.
hidden_units : int
The number of hidden units.
dropout_rate : float
Dropout rate.
""" """
def __init__(self, def __init__(self,
......
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册