change the docstring style from numpydoc to google, test=tts

9699c007 · 小湉湉 · 683679be · 9699c007 · 9699c007 · 9699c007
57 changed file
--- a/paddlespeech/t2s/datasets/data_table.py
+++ b/paddlespeech/t2s/datasets/data_table.py
@@ -22,26 +22,17 @@ from paddle.io import Dataset

 class DataTable(Dataset):
    """Dataset to load and convert data for general purpose.
-
-    Parameters
-    ----------
-    data : List[Dict[str, Any]]
-        Metadata, a list of meta datum, each of which is composed of 
-        several fields
-    fields : List[str], optional
-        Fields to use, if not specified, all the fields in the data are 
-        used, by default None
-    converters : Dict[str, Callable], optional
-        Converters used to process each field, by default None
-    use_cache : bool, optional
-        Whether to use cache, by default False
-
-    Raises
-    ------
-    ValueError
-        If there is some field that does not exist in data. 
-    ValueError
-        If there is some field in converters that does not exist in fields.
+    Args:
+        data (List[Dict[str, Any]]): Metadata, a list of meta datum, each of which is composed of several fields.
+        fields (List[str], optional): Fields to use, if not specified, all the fields in the data are used, by default None
+        converters (Dict[str, Callable], optional): Converters used to process each field, by default None
+        use_cache (bool, optional): Whether to use cache, by default False
+
+    Raises:
+        ValueError:
+            If there is some field that does not exist in data. 
+        ValueError:
+            If there is some field in converters that does not exist in fields.
    """

    def __init__(self,
@@ -95,15 +86,11 @@ class DataTable(Dataset):
        """Convert a meta datum to an example by applying the corresponding 
        converters to each fields requested.

-        Parameters
-        ----------
-        meta_datum : Dict[str, Any]
-            Meta datum
+        Args:
+            meta_datum (Dict[str, Any]): Meta datum

-        Returns
-        -------
-        Dict[str, Any]
-            Converted example
+        Returns:
+            Dict[str, Any]: Converted example
        """
        example = {}
        for field in self.fields:
@@ -118,16 +105,11 @@ class DataTable(Dataset):

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """Get an example given an index.
+        Args:
+            idx (int): Index of the example to get

-        Parameters
-        ----------
-        idx : int
-            Index of the example to get
-
-        Returns
-        -------
-        Dict[str, Any]
-            A converted example
+        Returns:
+            Dict[str, Any]: A converted example
        """
        if self.use_cache and self.caches[idx] is not None:
            return self.caches[idx]

--- a/paddlespeech/t2s/datasets/preprocess_utils.py
+++ b/paddlespeech/t2s/datasets/preprocess_utils.py
@@ -18,14 +18,10 @@ import re
 def get_phn_dur(file_name):
    '''
    read MFA duration.txt
-    Parameters
-    ----------
-    file_name : str or Path
-        path of gen_duration_from_textgrid.py's result
-    Returns
-    ----------
-    Dict
-        sentence: {'utt': ([char], [int])}
+    Args:
+        file_name (str or Path): path of gen_duration_from_textgrid.py's result
+    Returns: 
+        Dict: sentence: {'utt': ([char], [int])}
    '''
    f = open(file_name, 'r')
    sentence = {}
@@ -48,10 +44,8 @@ def get_phn_dur(file_name):
 def merge_silence(sentence):
    '''
    merge silences
-    Parameters
-    ----------
-    sentence : Dict
-        sentence: {'utt': (([char], [int]), str)}
+    Args:
+        sentence (Dict): sentence: {'utt': (([char], [int]), str)}
    '''
    for utt in sentence:
        cur_phn, cur_dur, speaker = sentence[utt]
@@ -81,12 +75,9 @@ def merge_silence(sentence):
 def get_input_token(sentence, output_path, dataset="baker"):
    '''
    get phone set from training data and save it
-    Parameters
-    ----------
-    sentence : Dict
-        sentence: {'utt': ([char], [int])}
-    output_path : str or path
-        path to save phone_id_map
+    Args:
+        sentence (Dict): sentence: {'utt': ([char], [int])}
+        output_path (str or path): path to save phone_id_map
    '''
    phn_token = set()
    for utt in sentence:
@@ -112,14 +103,10 @@ def get_phones_tones(sentence,
                     dataset="baker"):
    '''
    get phone set and tone set from training data and save it
-    Parameters
-    ----------
-    sentence : Dict
-        sentence: {'utt': ([char], [int])}
-    phones_output_path : str or path
-        path to save phone_id_map
-    tones_output_path : str or path
-        path to save tone_id_map
+    Args:
+        sentence (Dict): sentence: {'utt': ([char], [int])}
+        phones_output_path (str or path): path to save phone_id_map
+        tones_output_path (str or path): path to save tone_id_map
    '''
    phn_token = set()
    tone_token = set()
@@ -162,14 +149,10 @@ def get_spk_id_map(speaker_set, output_path):
 def compare_duration_and_mel_length(sentences, utt, mel):
    '''
    check duration error, correct sentences[utt] if possible, else pop sentences[utt]
-    Parameters
-    ----------
-    sentences : Dict
-        sentences[utt] = [phones_list ,durations_list]
-    utt : str
-        utt_id
-    mel : np.ndarry
-        features (num_frames, n_mels)
+    Args:
+        sentences (Dict): sentences[utt] = [phones_list ,durations_list]
+        utt (str): utt_id
+        mel (np.ndarray): features (num_frames, n_mels)
    '''

    if utt in sentences:

--- a/paddlespeech/t2s/datasets/vocoder_batch_fn.py
+++ b/paddlespeech/t2s/datasets/vocoder_batch_fn.py
@@ -29,15 +29,11 @@ class Clip(object):
            hop_size=256,
            aux_context_window=0, ):
        """Initialize customized collater for DataLoader.
+        Args:

-        Parameters
-        ----------
-        batch_max_steps : int
-            The maximum length of input signal in batch.
-        hop_size : int
-            Hop size of auxiliary features.
-        aux_context_window : int
-            Context window size for auxiliary feature conv.
+            batch_max_steps (int): The maximum length of input signal in batch.
+            hop_size (int): Hop size of auxiliary features.
+            aux_context_window (int): Context window size for auxiliary feature conv.

        """
        if batch_max_steps % hop_size != 0:
@@ -56,18 +52,15 @@ class Clip(object):
    def __call__(self, batch):
        """Convert into batch tensors.

-        Parameters
-        ----------
-        batch : list
-            list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
+        Args:
+            batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).

-        Returns
-        ----------
-        Tensor
-            Auxiliary feature batch (B, C, T'), where
-            T = (T' - 2 * aux_context_window) * hop_size.
-        Tensor
-            Target signal batch (B, 1, T).
+        Returns: 
+            Tensor:
+                Auxiliary feature batch (B, C, T'), where
+                T = (T' - 2 * aux_context_window) * hop_size.
+            Tensor:
+                Target signal batch (B, 1, T).

        """
        # check length
@@ -104,11 +97,10 @@ class Clip(object):
    def _adjust_length(self, x, c):
        """Adjust the audio and feature lengths.

-        Note
-        -------
-        Basically we assume that the length of x and c are adjusted
-        through preprocessing stage, but if we use other library processed
-        features, this process will be needed.
+        Note:
+            Basically we assume that the length of x and c are adjusted
+            through preprocessing stage, but if we use other library processed
+            features, this process will be needed.

        """
        if len(x) < c.shape[0] * self.hop_size:
@@ -162,22 +154,14 @@ class WaveRNNClip(Clip):
        # voc_pad = 2  this will pad the input so that the resnet can 'see' wider than input length
        # max_offsets = n_frames - 2 - (mel_win + 2 * hp.voc_pad) = n_frames - 15
        """Convert into batch tensors.
-
-        Parameters
-        ----------
-        batch : list
-            list of tuple of the pair of audio and features. 
-            Audio shape (T, ), features shape(T', C).
-
-        Returns
-        ----------
-        Tensor
-            Input signal batch (B, 1, T).
-        Tensor
-            Target signal batch (B, 1, T).
-        Tensor
-            Auxiliary feature batch (B, C, T'), where
-            T = (T' - 2 * aux_context_window) * hop_size.
+        Args:
+            batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
+
+        Returns:
+            Tensor: Input signal batch (B, 1, T).
+            Tensor: Target signal batch (B, 1, T).
+            Tensor: Auxiliary feature batch (B, C, T'), 
+                where T = (T' - 2 * aux_context_window) * hop_size.

        """
        # check length

--- a/paddlespeech/t2s/exps/transformer_tts/preprocess.py
+++ b/paddlespeech/t2s/exps/transformer_tts/preprocess.py
@@ -31,15 +31,12 @@ from paddlespeech.t2s.frontend import English


 def get_lj_sentences(file_name, frontend):
-    '''
-    read MFA duration.txt
-    Parameters
-    ----------
-    file_name : str or Path
-    Returns
-    ----------
-    Dict
-        sentence: {'utt': ([char], [int])}
+    '''read MFA duration.txt
+
+    Args:
+        file_name (str or Path)
+    Returns:
+        Dict: sentence: {'utt': ([char], [int])}
    '''
    f = open(file_name, 'r')
    sentence = {}
@@ -59,14 +56,11 @@ def get_lj_sentences(file_name, frontend):


 def get_input_token(sentence, output_path):
-    '''
-    get phone set from training data and save it
-    Parameters
-    ----------
-    sentence : Dict
-        sentence: {'utt': ([char], str)}
-    output_path : str or path
-        path to save phone_id_map
+    '''get phone set from training data and save it
+    
+    Args:
+        sentence (Dict): sentence: {'utt': ([char], str)}
+        output_path (str or path): path to save phone_id_map
    '''
    phn_token = set()
    for utt in sentence:

--- a/paddlespeech/t2s/frontend/arpabet.py
+++ b/paddlespeech/t2s/frontend/arpabet.py
@@ -133,16 +133,11 @@ class ARPABET(Phonetics):

    def phoneticize(self, sentence, add_start_end=False):
        """ Normalize the input text sequence and convert it into pronunciation sequence.
+        Args:
+            sentence (str): The input text sequence.
    
-        Parameters
-        -----------
-        sentence: str
-            The input text sequence.
-    
-        Returns
-        ----------
-        List[str]
-            The list of pronunciation sequence.
+        Returns:
+            List[str]: The list of pronunciation sequence.
        """
        phonemes = [
            self._remove_vowels(item) for item in self.backend(sentence)
@@ -156,16 +151,12 @@ class ARPABET(Phonetics):

    def numericalize(self, phonemes):
        """ Convert pronunciation sequence into pronunciation id sequence.
-        
-        Parameters
-        -----------
-        phonemes: List[str]
-            The list of pronunciation sequence.
+
+        Args:
+            phonemes (List[str]): The list of pronunciation sequence.
    
-        Returns
-        ----------
-        List[int]
-            The list of pronunciation id sequence.
+        Returns:
+            List[int]: The list of pronunciation id sequence.
        """
        ids = [self.vocab.lookup(item) for item in phonemes]
        return ids
@@ -173,30 +164,23 @@ class ARPABET(Phonetics):
    def reverse(self, ids):
        """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
        
-        Parameters
-        -----------
-        ids: List[int]
-            The list of pronunciation id sequence.
+        Args:
+            ids (List[int]): The list of pronunciation id sequence.
    
-        Returns
-        ----------
-        List[str]
-            The list of pronunciation sequence.
+        Returns: 
+            List[str]: 
+                The list of pronunciation sequence.
        """
        return [self.vocab.reverse(i) for i in ids]

    def __call__(self, sentence, add_start_end=False):
        """ Convert the input text sequence into pronunciation id sequence.
    
-        Parameters
-        -----------
-        sentence: str
-            The input text sequence.
+        Args:
+            sentence (str): The input text sequence.
    
-        Returns
-        ----------
-        List[str]
-            The list of pronunciation id sequence.
+        Returns:
+            List[int]: The list of pronunciation id sequence.
        """
        return self.numericalize(
            self.phoneticize(sentence, add_start_end=add_start_end))
@@ -229,15 +213,11 @@ class ARPABETWithStress(Phonetics):
    def phoneticize(self, sentence, add_start_end=False):
        """ Normalize the input text sequence and convert it into pronunciation sequence.
    
-        Parameters
-        -----------
-        sentence: str
-            The input text sequence.
+        Args: 
+            sentence (str): The input text sequence.
    
-        Returns
-        ----------
-        List[str]
-            The list of pronunciation sequence.
+        Returns: 
+            List[str]: The list of pronunciation sequence.
        """
        phonemes = self.backend(sentence)
        if add_start_end:
@@ -249,47 +229,33 @@ class ARPABETWithStress(Phonetics):

    def numericalize(self, phonemes):
        """ Convert pronunciation sequence into pronunciation id sequence.
-        
-        Parameters
-        -----------
-        phonemes: List[str]
-            The list of pronunciation sequence.
+
+        Args:
+            phonemes (List[str]): The list of pronunciation sequence.
    
-        Returns
-        ----------
-        List[int]
-            The list of pronunciation id sequence.
+        Returns:
+            List[int]: The list of pronunciation id sequence.
        """
        ids = [self.vocab.lookup(item) for item in phonemes]
        return ids

    def reverse(self, ids):
        """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
-        
-        Parameters
-        -----------
-        ids: List[int]
-            The list of pronunciation id sequence.
+        Args:
+            ids (List[int]): The list of pronunciation id sequence.
    
-        Returns
-        ----------
-        List[str]
-            The list of pronunciation sequence.
+        Returns: 
+            List[str]: The list of pronunciation sequence.
        """
        return [self.vocab.reverse(i) for i in ids]

    def __call__(self, sentence, add_start_end=False):
        """ Convert the input text sequence into pronunciation id sequence.
+        Args:
+            sentence (str): The input text sequence.
    
-        Parameters
-        -----------
-        sentence: str
-            The input text sequence.
-    
-        Returns
-        ----------
-        List[str]
-            The list of pronunciation id sequence.
+        Returns: 
+            List[int]: The list of pronunciation id sequence.
        """
        return self.numericalize(
            self.phoneticize(sentence, add_start_end=add_start_end))

--- a/paddlespeech/t2s/frontend/phonectic.py
+++ b/paddlespeech/t2s/frontend/phonectic.py
@@ -65,14 +65,10 @@ class English(Phonetics):

    def phoneticize(self, sentence):
        """ Normalize the input text sequence and convert it into pronunciation sequence.
-        Parameters
-        -----------
-        sentence: str
-            The input text sequence.
-        Returns
-        ----------
-        List[str]
-            The list of pronunciation sequence.
+        Args:
+            sentence (str): The input text sequence.
+        Returns: 
+            List[str]: The list of pronunciation sequence.
        """
        start = self.vocab.start_symbol
        end = self.vocab.end_symbol
@@ -123,14 +119,10 @@ class English(Phonetics):

    def numericalize(self, phonemes):
        """ Convert pronunciation sequence into pronunciation id sequence.
-        Parameters
-        -----------
-        phonemes: List[str]
-            The list of pronunciation sequence.
-        Returns
-        ----------
-        List[int]
-            The list of pronunciation id sequence.
+        Args:
+            phonemes (List[str]): The list of pronunciation sequence.
+        Returns: 
+            List[int]: The list of pronunciation id sequence.
        """
        ids = [
            self.vocab.lookup(item) for item in phonemes
@@ -140,27 +132,19 @@ class English(Phonetics):

    def reverse(self, ids):
        """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
-        Parameters
-        -----------
-        ids: List[int]
-            The list of pronunciation id sequence.
-        Returns
-        ----------
-        List[str]
-            The list of pronunciation sequence.
+        Args:
+            ids (List[int]): The list of pronunciation id sequence.
+        Returns: 
+            List[str]: The list of pronunciation sequence.
        """
        return [self.vocab.reverse(i) for i in ids]

    def __call__(self, sentence):
        """ Convert the input text sequence into pronunciation id sequence.
-        Parameters
-        -----------
-        sentence: str
-            The input text sequence.
-        Returns
-        ----------
-        List[str]
-            The list of pronunciation id sequence.
+        Args:
+            sentence (str): The input text sequence.
+        Returns:
+            List[int]: The list of pronunciation id sequence.
        """
        return self.numericalize(self.phoneticize(sentence))

@@ -183,28 +167,21 @@ class EnglishCharacter(Phonetics):

    def phoneticize(self, sentence):
        """ Normalize the input text sequence.
-        Parameters
-        -----------
-        sentence: str
-            The input text sequence.
-        Returns
-        ----------
-        str
-            A text sequence after normalize.
+        Args:
+            sentence (str): The input text sequence.
+        Returns:
+            str: A text sequence after normalize.
        """
        words = normalize(sentence)
        return words

    def numericalize(self, sentence):
        """ Convert a text sequence into ids.
-        Parameters
-        -----------
-        sentence: str
-            The input text sequence.
-        Returns
-        ----------
-        List[int]
-            List of a character id sequence.
+        Args:
+            sentence (str): The input text sequence.
+        Returns:
+            List[int]:
+                List of a character id sequence.
        """
        ids = [
            self.vocab.lookup(item) for item in sentence
@@ -214,27 +191,19 @@ class EnglishCharacter(Phonetics):

    def reverse(self, ids):
        """ Convert a character id sequence into text.
-        Parameters
-        -----------
-        ids: List[int]
-            List of a character id sequence.
-        Returns
-        ----------
-        str
-            The input text sequence.
+        Args:
+            ids (List[int]): List of a character id sequence.
+        Returns:
+            List[str]: The list of characters corresponding to the input ids.
        """
        return [self.vocab.reverse(i) for i in ids]

    def __call__(self, sentence):
        """ Normalize the input text sequence and convert it into character id sequence.
-        Parameters
-        -----------
-        sentence: str
-            The input text sequence.
-        Returns
-        ----------
-        List[int]
-            List of a character id sequence.
+        Args:
+            sentence (str): The input text sequence.
+        Returns: 
+            List[int]: List of a character id sequence.
        """
        return self.numericalize(self.phoneticize(sentence))

@@ -264,14 +233,10 @@ class Chinese(Phonetics):

    def phoneticize(self, sentence):
        """ Normalize the input text sequence and convert it into pronunciation sequence.
-        Parameters
-        -----------
-        sentence: str
-            The input text sequence.
-        Returns
-        ----------
-        List[str]
-            The list of pronunciation sequence.
+        Args:
+            sentence (str): The input text sequence.
+        Returns: 
+            List[str]: The list of pronunciation sequence.
        """
        # simplified = self.opencc_backend.convert(sentence)
        simplified = sentence
@@ -296,28 +261,20 @@ class Chinese(Phonetics):

    def numericalize(self, phonemes):
        """ Convert pronunciation sequence into pronunciation id sequence.
-        Parameters
-        -----------
-        phonemes: List[str]
-            The list of pronunciation sequence.
-        Returns
-        ----------
-        List[int]
-            The list of pronunciation id sequence.
+        Args:
+            phonemes (List[str]): The list of pronunciation sequence.
+        Returns:
+            List[int]: The list of pronunciation id sequence.
        """
        ids = [self.vocab.lookup(item) for item in phonemes]
        return ids

    def __call__(self, sentence):
        """ Convert the input text sequence into pronunciation id sequence.
-        Parameters
-        -----------
-        sentence: str
-            The input text sequence.
-        Returns
-        ----------
-        List[str]
-            The list of pronunciation id sequence.
+        Args:
+            sentence (str): The input text sequence.
+        Returns:
+            List[int]: The list of pronunciation id sequence.
        """
        return self.numericalize(self.phoneticize(sentence))

@@ -329,13 +286,9 @@ class Chinese(Phonetics):

    def reverse(self, ids):
        """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
-        Parameters
-        -----------
-        ids: List[int]
-            The list of pronunciation id sequence.
-        Returns
-        ----------
-        List[str]
-            The list of pronunciation sequence.
+        Args:
+            ids (List[int]): The list of pronunciation id sequence.
+        Returns: 
+            List[str]: The list of pronunciation sequence.
        """
        return [self.vocab.reverse(i) for i in ids]
--- a/paddlespeech/t2s/frontend/vocab.py
+++ b/paddlespeech/t2s/frontend/vocab.py
@@ -20,22 +20,12 @@ __all__ = ["Vocab"]
 class Vocab(object):
    """  Vocabulary.

-    Parameters
-    -----------
-    symbols: Iterable[str]
-        Common symbols.
-
-    padding_symbol: str, optional
-        Symbol for pad. Defaults to "<pad>".
-
-    unk_symbol: str, optional
-        Symbol for unknow. Defaults to "<unk>"
-
-    start_symbol: str, optional
-        Symbol for start. Defaults to "<s>"
-
-    end_symbol: str, optional
-        Symbol for end. Defaults to "</s>"
+    Args:
+        symbols (Iterable[str]): Common symbols.
+        padding_symbol (str, optional): Symbol for pad. Defaults to "<pad>".
+        unk_symbol (str, optional): Symbol for unknown. Defaults to "<unk>"
+        start_symbol (str, optional): Symbol for start. Defaults to "<s>"
+        end_symbol (str, optional): Symbol for end. Defaults to "</s>"
    """

    def __init__(self,

--- a/paddlespeech/t2s/frontend/zh_normalization/chronology.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/chronology.py
@@ -44,12 +44,10 @@ RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])'

 def replace_time(match) -> str:
    """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
-    str
+    Args:
+        match (re.Match)
+    Returns:
+        str
    """

    is_range = len(match.groups()) > 5
@@ -87,12 +85,10 @@ RE_DATE = re.compile(r'(\d{4}|\d{2})年'

 def replace_date(match) -> str:
    """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
-    str
+    Args:
+        match (re.Match)
+    Returns:
+        str
    """
    year = match.group(1)
    month = match.group(3)
@@ -114,12 +110,10 @@ RE_DATE2 = re.compile(

 def replace_date2(match) -> str:
    """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
-    str
+    Args:
+        match (re.Match)
+    Returns:
+        str
    """
    year = match.group(1)
    month = match.group(3)

--- a/paddlespeech/t2s/frontend/zh_normalization/num.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/num.py
@@ -36,12 +36,10 @@ RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')

 def replace_frac(match) -> str:
    """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
-    str
+    Args:
+        match (re.Match)
+    Returns:
+        str
    """
    sign = match.group(1)
    nominator = match.group(2)
@@ -59,12 +57,10 @@ RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%')

 def replace_percentage(match) -> str:
    """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
-    str
+    Args:
+        match (re.Match)
+    Returns:
+        str
    """
    sign = match.group(1)
    percent = match.group(2)
@@ -81,12 +77,10 @@ RE_INTEGER = re.compile(r'(-)' r'(\d+)')

 def replace_negative_num(match) -> str:
    """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
-    str
+    Args:
+        match (re.Match)
+    Returns:
+        str
    """
    sign = match.group(1)
    number = match.group(2)
@@ -103,12 +97,10 @@ RE_DEFAULT_NUM = re.compile(r'\d{3}\d*')

 def replace_default_num(match):
    """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
-    str
+    Args:
+        match (re.Match)
+    Returns:
+        str
    """
    number = match.group(0)
    return verbalize_digit(number)
@@ -124,12 +116,10 @@ RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))')

 def replace_positive_quantifier(match) -> str:
    """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
-    str
+    Args:
+        match (re.Match)
+    Returns:
+        str
    """
    number = match.group(1)
    match_2 = match.group(2)
@@ -142,12 +132,10 @@ def replace_positive_quantifier(match) -> str:

 def replace_number(match) -> str:
    """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
-    str
+    Args:
+        match (re.Match)
+    Returns:
+        str
    """
    sign = match.group(1)
    number = match.group(2)
@@ -169,12 +157,10 @@ RE_RANGE = re.compile(

 def replace_range(match) -> str:
    """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
-    str
+    Args:
+        match (re.Match)
+    Returns:
+        str
    """
    first, second = match.group(1), match.group(8)
    first = RE_NUMBER.sub(replace_number, first)

--- a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py
@@ -45,23 +45,19 @@ def phone2str(phone_string: str, mobile=True) -> str:

 def replace_phone(match) -> str:
    """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
-    str
+    Args:
+        match (re.Match)
+    Returns:
+        str
    """
    return phone2str(match.group(0), mobile=False)


 def replace_mobile(match) -> str:
    """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
-    str
+    Args:
+        match (re.Match)
+    Returns:
+        str
    """
    return phone2str(match.group(0))
--- a/paddlespeech/t2s/frontend/zh_normalization/quantifier.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/quantifier.py
@@ -22,12 +22,10 @@ RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)')

 def replace_temperature(match) -> str:
    """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
-    str
+    Args:
+        match (re.Match)
+    Returns:
+        str
    """
    sign = match.group(1)
    temperature = match.group(2)

--- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
@@ -55,14 +55,10 @@ class TextNormalizer():

    def _split(self, text: str, lang="zh") -> List[str]:
        """Split long text into sentences with sentence-splitting punctuations.
-        Parameters
-        ----------
-        text : str
-            The input text.
-        Returns
-        -------
-        List[str]
-            Sentences.
+        Args:
+            text (str): The input text.
+        Returns:
+            List[str]: Sentences.
        """
        # Only for pure Chinese here
        if lang == "zh":

--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
--- a/paddlespeech/t2s/models/hifigan/hifigan.py
+++ b/paddlespeech/t2s/models/hifigan/hifigan.py
@@ -37,35 +37,21 @@ class HiFiGANGenerator(nn.Layer):
            use_weight_norm: bool=True,
            init_type: str="xavier_uniform", ):
        """Initialize HiFiGANGenerator module.
-        Parameters
-        ----------
-        in_channels : int
-            Number of input channels.
-        out_channels : int
-            Number of output channels.
-        channels : int
-            Number of hidden representation channels.
-        kernel_size : int
-            Kernel size of initial and final conv layer.
-        upsample_scales : list
-            List of upsampling scales.
-        upsample_kernel_sizes : list
-            List of kernel sizes for upsampling layers.
-        resblock_kernel_sizes : list
-            List of kernel sizes for residual blocks.
-        resblock_dilations : list
-            List of dilation list for residual blocks.
-        use_additional_convs : bool
-            Whether to use additional conv layers in residual blocks.
-        bias : bool
-            Whether to add bias parameter in convolution layers.
-        nonlinear_activation : str
-            Activation function module name.
-        nonlinear_activation_params : dict
-            Hyperparameters for activation function.
-        use_weight_norm : bool
-            Whether to use weight norm.
-            If set to true, it will be applied to all of the conv layers.
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+            channels (int): Number of hidden representation channels.
+            kernel_size (int): Kernel size of initial and final conv layer.
+            upsample_scales (list): List of upsampling scales.
+            upsample_kernel_sizes (list): List of kernel sizes for upsampling layers.
+            resblock_kernel_sizes (list): List of kernel sizes for residual blocks.
+            resblock_dilations (list): List of dilation list for residual blocks.
+            use_additional_convs (bool): Whether to use additional conv layers in residual blocks.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            nonlinear_activation (str): Activation function module name.
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
+            use_weight_norm (bool): Whether to use weight norm.
+                If set to true, it will be applied to all of the conv layers.
        """
        super().__init__()

@@ -134,14 +120,11 @@ class HiFiGANGenerator(nn.Layer):

    def forward(self, c):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        c : Tensor
-            Input tensor (B, in_channels, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, out_channels, T).
+        
+        Args:
+            c (Tensor): Input tensor (B, in_channels, T).
+        Returns:
+            Tensor: Output tensor (B, out_channels, T).
        """
        c = self.input_conv(c)
        for i in range(self.num_upsamples):
@@ -196,15 +179,12 @@ class HiFiGANGenerator(nn.Layer):

    def inference(self, c):
        """Perform inference.
-        Parameters
-        ----------
-        c : Tensor 
-            Input tensor (T, in_channels).
-            normalize_before (bool): Whether to perform normalization.
-        Returns
-        ----------
-        Tensor
-            Output tensor (T ** prod(upsample_scales), out_channels).
+        Args:
+            c (Tensor): Input tensor (T, in_channels).
+                normalize_before (bool): Whether to perform normalization.
+        Returns:
+            Tensor:
+                Output tensor (T * prod(upsample_scales), out_channels).
        """
        c = self.forward(c.transpose([1, 0]).unsqueeze(0))
        return c.squeeze(0).transpose([1, 0])
@@ -229,36 +209,23 @@ class HiFiGANPeriodDiscriminator(nn.Layer):
            use_spectral_norm: bool=False,
            init_type: str="xavier_uniform", ):
        """Initialize HiFiGANPeriodDiscriminator module.
-        Parameters
-        ----------
-        in_channels : int
-            Number of input channels.
-        out_channels : int
-            Number of output channels.
-        period : int
-            Period.
-        kernel_sizes : list
-            Kernel sizes of initial conv layers and the final conv layer.
-        channels : int
-            Number of initial channels.
-        downsample_scales : list
-            List of downsampling scales.
-        max_downsample_channels : int
-            Number of maximum downsampling channels.
-        use_additional_convs : bool
-            Whether to use additional conv layers in residual blocks.
-        bias : bool
-            Whether to add bias parameter in convolution layers.
-        nonlinear_activation : str
-            Activation function module name.
-        nonlinear_activation_params : dict
-            Hyperparameters for activation function.
-        use_weight_norm : bool
-            Whether to use weight norm.
-            If set to true, it will be applied to all of the conv layers.
-        use_spectral_norm : bool
-            Whether to use spectral norm.
-            If set to true, it will be applied to all of the conv layers.
+
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+            period (int): Period.
+            kernel_sizes (list): Kernel sizes of initial conv layers and the final conv layer.
+            channels (int): Number of initial channels.
+            downsample_scales (list): List of downsampling scales.
+            max_downsample_channels (int): Number of maximum downsampling channels.
+            use_additional_convs (bool): Whether to use additional conv layers in residual blocks.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            nonlinear_activation (str): Activation function module name.
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
+            use_weight_norm (bool): Whether to use weight norm.
+                If set to true, it will be applied to all of the conv layers.
+            use_spectral_norm (bool): Whether to use spectral norm.
+                If set to true, it will be applied to all of the conv layers.
        """
        super().__init__()

@@ -307,14 +274,11 @@ class HiFiGANPeriodDiscriminator(nn.Layer):

    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        c : Tensor
-            Input tensor (B, in_channels, T).
-        Returns
-        ----------
-        list
-            List of each layer's tensors.
+
+        Args:
+            x (Tensor): Input tensor (B, in_channels, T).
+        Returns:
+            list: List of each layer's tensors.
        """
        # transform 1d to 2d -> (B, C, T/P, P)
        b, c, t = paddle.shape(x)
@@ -379,13 +343,11 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer):
            },
            init_type: str="xavier_uniform", ):
        """Initialize HiFiGANMultiPeriodDiscriminator module.
-        Parameters
-        ----------
-        periods : list
-            List of periods.
-        discriminator_params : dict
-            Parameters for hifi-gan period discriminator module.
-            The period parameter will be overwritten.
+
+        Args:
+            periods (list): List of periods.
+            discriminator_params (dict): Parameters for hifi-gan period discriminator module.
+                The period parameter will be overwritten.
        """
        super().__init__()
        # initialize parameters
@@ -399,14 +361,11 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer):

    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input noise signal (B, 1, T).
-        Returns
-        ----------
-        List
-            List of list of each discriminator outputs, which consists of each layer output tensors.
+
+        Args:
+            x (Tensor): Input noise signal (B, 1, T).
+        Returns:
+            List: List of list of each discriminator outputs, which consists of each layer output tensors.
        """
        outs = []
        for f in self.discriminators:
@@ -434,33 +393,22 @@ class HiFiGANScaleDiscriminator(nn.Layer):
            use_spectral_norm: bool=False,
            init_type: str="xavier_uniform", ):
        """Initilize HiFiGAN scale discriminator module.
-        Parameters
-        ----------
-        in_channels : int
-            Number of input channels.
-        out_channels : int
-            Number of output channels.
-        kernel_sizes : list
-            List of four kernel sizes. The first will be used for the first conv layer,
-            and the second is for downsampling part, and the remaining two are for output layers.
-        channels : int
-            Initial number of channels for conv layer.
-        max_downsample_channels : int
-            Maximum number of channels for downsampling layers.
-        bias : bool
-            Whether to add bias parameter in convolution layers.
-        downsample_scales : list
-            List of downsampling scales.
-        nonlinear_activation : str
-            Activation function module name.
-        nonlinear_activation_params : dict
-            Hyperparameters for activation function.
-        use_weight_norm : bool
-            Whether to use weight norm.
-            If set to true, it will be applied to all of the conv layers.
-        use_spectral_norm : bool
-            Whether to use spectral norm.
-            If set to true, it will be applied to all of the conv layers.
+
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+            kernel_sizes (list): List of four kernel sizes. The first will be used for the first conv layer,
+                and the second is for downsampling part, and the remaining two are for output layers.
+            channels (int): Initial number of channels for conv layer.
+            max_downsample_channels (int): Maximum number of channels for downsampling layers.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            downsample_scales (list): List of downsampling scales.
+            nonlinear_activation (str): Activation function module name.
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
+            use_weight_norm (bool): Whether to use weight norm.
+                If set to true, it will be applied to all of the conv layers.
+            use_spectral_norm (bool): Whether to use spectral norm.
+                If set to true, it will be applied to all of the conv layers.
        """
        super().__init__()

@@ -546,14 +494,11 @@ class HiFiGANScaleDiscriminator(nn.Layer):

    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input noise signal (B, 1, T).
-        Returns
-        ----------
-        List
-            List of output tensors of each layer.
+
+        Args:
+            x (Tensor): Input noise signal (B, 1, T).
+        Returns:
+            List: List of output tensors of each layer.
        """
        outs = []
        for f in self.layers:
@@ -613,20 +558,14 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer):
            follow_official_norm: bool=False,
            init_type: str="xavier_uniform", ):
        """Initilize HiFiGAN multi-scale discriminator module.
-        Parameters
-        ----------
-        scales : int
-            Number of multi-scales.
-        downsample_pooling : str
-            Pooling module name for downsampling of the inputs.
-        downsample_pooling_params : dict
-            Parameters for the above pooling module.
-        discriminator_params : dict
-            Parameters for hifi-gan scale discriminator module.
-        follow_official_norm : bool
-            Whether to follow the norm setting of the official
-            implementaion. The first discriminator uses spectral norm and the other
-            discriminators use weight norm.
+   
+        Args:
+            scales (int): Number of multi-scales.
+            downsample_pooling (str): Pooling module name for downsampling of the inputs.
+            downsample_pooling_params (dict): Parameters for the above pooling module.
+            discriminator_params (dict): Parameters for hifi-gan scale discriminator module.
+            follow_official_norm (bool): Whether to follow the norm setting of the official
+                implementation. The first discriminator uses spectral norm and the other discriminators use weight norm.
        """
        super().__init__()

@@ -651,14 +590,11 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer):

    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input noise signal (B, 1, T).
-        Returns
-        ----------
-        List
-            List of list of each discriminator outputs, which consists of each layer output tensors.
+
+        Args:
+            x (Tensor): Input noise signal (B, 1, T).
+        Returns:
+            List: List of list of each discriminator outputs, which consists of each layer output tensors.
        """
        outs = []
        for f in self.discriminators:
@@ -715,24 +651,17 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer):
            },
            init_type: str="xavier_uniform", ):
        """Initilize HiFiGAN multi-scale + multi-period discriminator module.
-        Parameters
-        ----------
-        scales : int
-            Number of multi-scales.
-        scale_downsample_pooling : str
-            Pooling module name for downsampling of the inputs.
-        scale_downsample_pooling_params : dict
-            Parameters for the above pooling module.
-        scale_discriminator_params : dict
-            Parameters for hifi-gan scale discriminator module.
-        follow_official_norm : bool): Whether to follow the norm setting of the official
-            implementaion. The first discriminator uses spectral norm and the other
-            discriminators use weight norm.
-        periods : list
-            List of periods.
-        period_discriminator_params : dict
-            Parameters for hifi-gan period discriminator module.
-            The period parameter will be overwritten.
+
+        Args:
+            scales (int): Number of multi-scales.
+            scale_downsample_pooling (str): Pooling module name for downsampling of the inputs.
+            scale_downsample_pooling_params (dict): Parameters for the above pooling module.
+            scale_discriminator_params (dict): Parameters for hifi-gan scale discriminator module.
+            follow_official_norm (bool): Whether to follow the norm setting of the official implementation.
+                The first discriminator uses spectral norm and the other discriminators use weight norm.
+            periods (list): List of periods.
+            period_discriminator_params (dict): Parameters for hifi-gan period discriminator module.
+                The period parameter will be overwritten.
        """
        super().__init__()

@@ -751,16 +680,14 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer):

    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input noise signal (B, 1, T).
-        Returns
-        ----------
-        List:
-            List of list of each discriminator outputs,
-            which consists of each layer output tensors.
-            Multi scale and multi period ones are concatenated.
+
+        Args:
+            x (Tensor): Input noise signal (B, 1, T).
+        Returns:
+            List:
+                List of list of each discriminator outputs,
+                which consists of each layer output tensors.
+                Multi scale and multi period ones are concatenated.
        """
        msd_outs = self.msd(x)
        mpd_outs = self.mpd(x)

--- a/paddlespeech/t2s/models/melgan/melgan.py
+++ b/paddlespeech/t2s/models/melgan/melgan.py
@@ -51,41 +51,26 @@ class MelGANGenerator(nn.Layer):
            use_causal_conv: bool=False,
            init_type: str="xavier_uniform", ):
        """Initialize MelGANGenerator module.
-        Parameters
-        ----------
-        in_channels : int
-            Number of input channels.
-        out_channels : int
-            Number of output channels,
-            the number of sub-band is out_channels in multi-band melgan.
-        kernel_size : int
-            Kernel size of initial and final conv layer.
-        channels : int
-            Initial number of channels for conv layer.
-        bias : bool
-            Whether to add bias parameter in convolution layers.
-        upsample_scales : List[int]
-            List of upsampling scales.
-        stack_kernel_size : int
-            Kernel size of dilated conv layers in residual stack.
-        stacks : int
-            Number of stacks in a single residual stack.
-        nonlinear_activation : Optional[str], optional
-            Non linear activation in upsample network, by default None
-        nonlinear_activation_params : Dict[str, Any], optional
-            Parameters passed to the linear activation in the upsample network, 
-            by default {}
-        pad : str
-            Padding function module name before dilated convolution layer.
-        pad_params : dict
-            Hyperparameters for padding function.
-        use_final_nonlinear_activation : nn.Layer
-            Activation function for the final layer.
-        use_weight_norm : bool
-            Whether to use weight norm.
-            If set to true, it will be applied to all of the conv layers.
-        use_causal_conv : bool
-            Whether to use causal convolution.
+
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels,
+                the number of sub-band is out_channels in multi-band melgan.
+            kernel_size (int): Kernel size of initial and final conv layer.
+            channels (int): Initial number of channels for conv layer.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            upsample_scales (List[int]): List of upsampling scales.
+            stack_kernel_size (int): Kernel size of dilated conv layers in residual stack.
+            stacks (int): Number of stacks in a single residual stack.
+            nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None
+            nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the nonlinear activation in the upsample network,
+                by default {}
+            pad (str): Padding function module name before dilated convolution layer.
+            pad_params (dict): Hyperparameters for padding function.
+            use_final_nonlinear_activation (nn.Layer): Activation function for the final layer.
+            use_weight_norm (bool): Whether to use weight norm.
+                If set to true, it will be applied to all of the conv layers.
+            use_causal_conv (bool): Whether to use causal convolution.
        """
        super().__init__()

@@ -207,14 +192,11 @@ class MelGANGenerator(nn.Layer):

    def forward(self, c):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        c : Tensor
-            Input tensor (B, in_channels, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, out_channels, T ** prod(upsample_scales)).
+
+        Args:
+            c (Tensor): Input tensor (B, in_channels, T).
+        Returns:
+            Tensor: Output tensor (B, out_channels, T * prod(upsample_scales)).
        """
        out = self.melgan(c)
        return out
@@ -260,14 +242,11 @@ class MelGANGenerator(nn.Layer):

    def inference(self, c):
        """Perform inference.
-        Parameters
-        ----------
-        c : Union[Tensor, ndarray]
-            Input tensor (T, in_channels).
-        Returns
-        ----------
-        Tensor
-            Output tensor (out_channels*T ** prod(upsample_scales), 1).
+
+        Args:
+            c (Union[Tensor, ndarray]): Input tensor (T, in_channels).
+        Returns:
+            Tensor: Output tensor (out_channels * T * prod(upsample_scales), 1).
        """
        # pseudo batch
        c = c.transpose([1, 0]).unsqueeze(0)
@@ -298,33 +277,22 @@ class MelGANDiscriminator(nn.Layer):
            pad_params: Dict[str, Any]={"mode": "reflect"},
            init_type: str="xavier_uniform", ):
        """Initilize MelGAN discriminator module.
-        Parameters
-        ----------
-        in_channels : int
-            Number of input channels.
-        out_channels : int
-            Number of output channels.
-        kernel_sizes : List[int]
-            List of two kernel sizes. The prod will be used for the first conv layer,
-            and the first and the second kernel sizes will be used for the last two layers.
-            For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15,
-            the last two layers' kernel size will be 5 and 3, respectively.
-        channels : int
-            Initial number of channels for conv layer.
-        max_downsample_channels : int
-            Maximum number of channels for downsampling layers.
-        bias : bool
-            Whether to add bias parameter in convolution layers.
-        downsample_scales : List[int]
-            List of downsampling scales.
-        nonlinear_activation : str
-            Activation function module name.
-        nonlinear_activation_params : dict
-            Hyperparameters for activation function.
-        pad : str
-            Padding function module name before dilated convolution layer.
-        pad_params : dict
-            Hyperparameters for padding function.
+
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+            kernel_sizes (List[int]): List of two kernel sizes. The prod will be used for the first conv layer,
+                and the first and the second kernel sizes will be used for the last two layers.
+                For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15,
+                the last two layers' kernel size will be 5 and 3, respectively.
+            channels (int): Initial number of channels for conv layer.
+            max_downsample_channels (int): Maximum number of channels for downsampling layers.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            downsample_scales (List[int]): List of downsampling scales.
+            nonlinear_activation (str): Activation function module name.
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
+            pad (str): Padding function module name before dilated convolution layer.
+            pad_params (dict): Hyperparameters for padding function.
        """
        super().__init__()

@@ -395,14 +363,10 @@ class MelGANDiscriminator(nn.Layer):

    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input noise signal (B, 1, T).
-        Returns
-        ----------
-        List
-            List of output tensors of each layer (for feat_match_loss).
+        Args:
+            x (Tensor): Input noise signal (B, 1, T).
+        Returns:
+            List: List of output tensors of each layer (for feat_match_loss).
        """
        outs = []
        for f in self.layers:
@@ -440,39 +404,24 @@ class MelGANMultiScaleDiscriminator(nn.Layer):
            use_weight_norm: bool=True,
            init_type: str="xavier_uniform", ):
        """Initilize MelGAN multi-scale discriminator module.
-        Parameters
-        ----------
-        in_channels : int
-            Number of input channels.
-        out_channels : int
-            Number of output channels.
-        scales : int
-            Number of multi-scales.
-        downsample_pooling : str
-            Pooling module name for downsampling of the inputs.
-        downsample_pooling_params : dict
-            Parameters for the above pooling module.
-        kernel_sizes : List[int]
-            List of two kernel sizes. The sum will be used for the first conv layer,
-            and the first and the second kernel sizes will be used for the last two layers.
-        channels : int
-            Initial number of channels for conv layer.
-        max_downsample_channels : int
-            Maximum number of channels for downsampling layers.
-        bias : bool
-            Whether to add bias parameter in convolution layers.
-        downsample_scales : List[int]
-            List of downsampling scales.
-        nonlinear_activation : str
-            Activation function module name.
-        nonlinear_activation_params : dict
-            Hyperparameters for activation function.
-        pad : str
-            Padding function module name before dilated convolution layer.
-        pad_params : dict
-            Hyperparameters for padding function.
-        use_causal_conv : bool
-            Whether to use causal convolution.
+
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+            scales (int): Number of multi-scales.
+            downsample_pooling (str): Pooling module name for downsampling of the inputs.
+            downsample_pooling_params (dict): Parameters for the above pooling module.
+            kernel_sizes (List[int]): List of two kernel sizes. The sum will be used for the first conv layer,
+                and the first and the second kernel sizes will be used for the last two layers.
+            channels (int): Initial number of channels for conv layer.
+            max_downsample_channels (int): Maximum number of channels for downsampling layers.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            downsample_scales (List[int]): List of downsampling scales.
+            nonlinear_activation (str): Activation function module name.
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
+            pad (str): Padding function module name before dilated convolution layer.
+            pad_params (dict): Hyperparameters for padding function.
+            use_causal_conv (bool): Whether to use causal convolution.
        """
        super().__init__()

@@ -514,14 +463,10 @@ class MelGANMultiScaleDiscriminator(nn.Layer):

    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input noise signal (B, 1, T).
-        Returns
-        ----------
-        List
-            List of list of each discriminator outputs, which consists of each layer output tensors.
+        Args:
+            x (Tensor): Input noise signal (B, 1, T).
+        Returns:
+            List: List of list of each discriminator outputs, which consists of each layer output tensors.
        """
        outs = []
        for f in self.discriminators:

--- a/paddlespeech/t2s/models/melgan/style_melgan.py
+++ b/paddlespeech/t2s/models/melgan/style_melgan.py
@@ -52,37 +52,23 @@ class StyleMelGANGenerator(nn.Layer):
            use_weight_norm: bool=True,
            init_type: str="xavier_uniform", ):
        """Initilize Style MelGAN generator.
-        Parameters
-        ----------
-        in_channels : int
-            Number of input noise channels.
-        aux_channels : int
-            Number of auxiliary input channels.
-        channels : int
-            Number of channels for conv layer.
-        out_channels : int
-            Number of output channels.
-        kernel_size : int
-            Kernel size of conv layers.
-        dilation : int
-            Dilation factor for conv layers.
-        bias : bool
-            Whether to add bias parameter in convolution layers.
-        noise_upsample_scales : list
-            List of noise upsampling scales.
-        noise_upsample_activation : str
-            Activation function module name for noise upsampling.
-        noise_upsample_activation_params : dict
-            Hyperparameters for the above activation function.
-        upsample_scales : list
-            List of upsampling scales.
-        upsample_mode : str
-            Upsampling mode in TADE layer.
-        gated_function : str
-            Gated function in TADEResBlock ("softmax" or "sigmoid").
-        use_weight_norm : bool
-            Whether to use weight norm.
-            If set to true, it will be applied to all of the conv layers.
+
+        Args:
+            in_channels (int): Number of input noise channels.
+            aux_channels (int): Number of auxiliary input channels.
+            channels (int): Number of channels for conv layer.
+            out_channels (int): Number of output channels.
+            kernel_size (int): Kernel size of conv layers.
+            dilation (int): Dilation factor for conv layers.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            noise_upsample_scales (list): List of noise upsampling scales.
+            noise_upsample_activation (str): Activation function module name for noise upsampling.
+            noise_upsample_activation_params (dict): Hyperparameters for the above activation function.
+            upsample_scales (list): List of upsampling scales.
+            upsample_mode (str): Upsampling mode in TADE layer.
+            gated_function (str): Gated function in TADEResBlock ("softmax" or "sigmoid").
+            use_weight_norm (bool): Whether to use weight norm.
+                If set to true, it will be applied to all of the conv layers.
        """
        super().__init__()

@@ -147,16 +133,12 @@ class StyleMelGANGenerator(nn.Layer):

    def forward(self, c, z=None):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        c : Tensor
-            Auxiliary input tensor (B, channels, T).
-        z : Tensor
-            Input noise tensor (B, in_channels, 1).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, out_channels, T ** prod(upsample_scales)).
+
+        Args:
+            c (Tensor): Auxiliary input tensor (B, channels, T).
+            z (Tensor): Input noise tensor (B, in_channels, 1).
+        Returns:
+            Tensor: Output tensor (B, out_channels, T * prod(upsample_scales)).
        """
        # batch_max_steps(24000) == noise_upsample_factor(80) * upsample_factor(300)
        if z is None:
@@ -211,14 +193,10 @@ class StyleMelGANGenerator(nn.Layer):

    def inference(self, c):
        """Perform inference.
-        Parameters
-        ----------
-        c : Tensor
-            Input tensor (T, in_channels).
-        Returns
-        ----------
-        Tensor
-            Output tensor (T ** prod(upsample_scales), out_channels).
+        Args:
+            c (Tensor): Input tensor (T, in_channels).
+        Returns:
+            Tensor: Output tensor (T * prod(upsample_scales), out_channels).
        """
        # (1, in_channels, T)
        c = c.transpose([1, 0]).unsqueeze(0)
@@ -278,18 +256,13 @@ class StyleMelGANDiscriminator(nn.Layer):
            use_weight_norm: bool=True,
            init_type: str="xavier_uniform", ):
        """Initilize Style MelGAN discriminator.
-        Parameters
-        ----------
-        repeats : int
-            Number of repititons to apply RWD.
-        window_sizes : list
-            List of random window sizes.
-        pqmf_params : list
-            List of list of Parameters for PQMF modules
-        discriminator_params : dict
-            Parameters for base discriminator module.
-        use_weight_nom : bool
-            Whether to apply weight normalization.
+
+        Args:
+            repeats (int): Number of repetitions to apply RWD.
+            window_sizes (list): List of random window sizes.
+            pqmf_params (list): List of list of Parameters for PQMF modules
+            discriminator_params (dict): Parameters for base discriminator module.
+            use_weight_norm (bool): Whether to apply weight normalization.
        """
        super().__init__()

@@ -325,15 +298,11 @@ class StyleMelGANDiscriminator(nn.Layer):

    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, 1, T).
-        Returns
-        ----------
-        List
-            List of discriminator outputs, #items in the list will be
-            equal to repeats * #discriminators.
+        Args:
+            x (Tensor): Input tensor (B, 1, T).
+        Returns:
+            List: List of discriminator outputs, #items in the list will be
+                equal to repeats * #discriminators.
        """
        outs = []
        for _ in range(self.repeats):

--- a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py
+++ b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py
@@ -31,51 +31,30 @@ from paddlespeech.t2s.modules.upsample import ConvInUpsampleNet
 class PWGGenerator(nn.Layer):
    """Wave Generator for Parallel WaveGAN

-    Parameters
-    ----------
-    in_channels : int, optional
-        Number of channels of the input waveform, by default 1
-    out_channels : int, optional
-        Number of channels of the output waveform, by default 1
-    kernel_size : int, optional
-        Kernel size of the residual blocks inside, by default 3
-    layers : int, optional
-        Number of residual blocks inside, by default 30
-    stacks : int, optional
-        The number of groups to split the residual blocks into, by default 3
-        Within each group, the dilation of the residual block grows 
-        exponentially.
-    residual_channels : int, optional
-        Residual channel of the residual blocks, by default 64
-    gate_channels : int, optional
-        Gate channel of the residual blocks, by default 128
-    skip_channels : int, optional
-        Skip channel of the residual blocks, by default 64
-    aux_channels : int, optional
-        Auxiliary channel of the residual blocks, by default 80
-    aux_context_window : int, optional
-        The context window size of the first convolution applied to the 
-        auxiliary input, by default 2
-    dropout : float, optional
-        Dropout of the residual blocks, by default 0.
-    bias : bool, optional
-        Whether to use bias in residual blocks, by default True
-    use_weight_norm : bool, optional
-        Whether to use weight norm in all convolutions, by default True
-    use_causal_conv : bool, optional
-        Whether to use causal padding in the upsample network and residual 
-        blocks, by default False
-    upsample_scales : List[int], optional
-        Upsample scales of the upsample network, by default [4, 4, 4, 4]
-    nonlinear_activation : Optional[str], optional
-        Non linear activation in upsample network, by default None
-    nonlinear_activation_params : Dict[str, Any], optional
-        Parameters passed to the linear activation in the upsample network, 
-        by default {}
-    interpolate_mode : str, optional
-        Interpolation mode of the upsample network, by default "nearest"
-    freq_axis_kernel_size : int, optional
-        Kernel size along the frequency axis of the upsample network, by default 1
+    Args:
+        in_channels (int, optional): Number of channels of the input waveform, by default 1
+        out_channels (int, optional): Number of channels of the output waveform, by default 1
+        kernel_size (int, optional): Kernel size of the residual blocks inside, by default 3
+        layers (int, optional): Number of residual blocks inside, by default 30
+        stacks (int, optional): The number of groups to split the residual blocks into, by default 3
+            Within each group, the dilation of the residual block grows exponentially.
+        residual_channels (int, optional): Residual channel of the residual blocks, by default 64
+        gate_channels (int, optional): Gate channel of the residual blocks, by default 128
+        skip_channels (int, optional): Skip channel of the residual blocks, by default 64
+        aux_channels (int, optional): Auxiliary channel of the residual blocks, by default 80
+        aux_context_window (int, optional): The context window size of the first convolution applied to the 
+            auxiliary input, by default 2
+        dropout (float, optional): Dropout of the residual blocks, by default 0.
+        bias (bool, optional): Whether to use bias in residual blocks, by default True
+        use_weight_norm (bool, optional): Whether to use weight norm in all convolutions, by default True
+        use_causal_conv (bool, optional): Whether to use causal padding in the upsample network and residual 
+            blocks, by default False
+        upsample_scales (List[int], optional): Upsample scales of the upsample network, by default [4, 4, 4, 4]
+        nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None
+        nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the nonlinear activation in the upsample network,
+            by default {}
+        interpolate_mode (str, optional): Interpolation mode of the upsample network, by default "nearest"
+        freq_axis_kernel_size (int, optional): Kernel size along the frequency axis of the upsample network, by default 1
    """

    def __init__(
@@ -167,18 +146,13 @@ class PWGGenerator(nn.Layer):
    def forward(self, x, c):
        """Generate waveform.

-        Parameters
-        ----------
-        x : Tensor
-            Shape (N, C_in, T), The input waveform.
-        c : Tensor
-            Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It 
+        Args:
+            x (Tensor): Shape (N, C_in, T), The input waveform.
+            c (Tensor): Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It
            is upsampled to match the time resolution of the input.

-        Returns
-        -------
-        Tensor
-            Shape (N, C_out, T), the generated waveform.
+        Returns:
+            Tensor: Shape (N, C_out, T), the generated waveform.
        """
        c = self.upsample_net(c)
        assert c.shape[-1] == x.shape[-1]
@@ -218,19 +192,14 @@ class PWGGenerator(nn.Layer):
        self.apply(_remove_weight_norm)

    def inference(self, c=None):
-        """Waveform generation. This function is used for single instance 
-        inference.
-        Parameters
-        ----------
-        c : Tensor, optional
-            Shape (T', C_aux), the auxiliary input, by default None
-        x : Tensor, optional
-            Shape (T, C_in), the noise waveform, by default None
-            If not provided, a sample is drawn from a gaussian distribution.
-        Returns
-        -------
-        Tensor
-            Shape (T, C_out), the generated waveform
+        """Waveform generation. This function is used for single instance inference.
+
+        Args:
+            c (Tensor, optional): Shape (T', C_aux), the auxiliary input, by default None
+            x (Tensor, optional): Shape (T, C_in), the noise waveform, by default None
+
+        Returns:
+            Tensor: Shape (T, C_out), the generated waveform
        """
        # when to static, can not input x, see https://github.com/PaddlePaddle/Parakeet/pull/132/files
        x = paddle.randn(
@@ -244,32 +213,21 @@ class PWGGenerator(nn.Layer):
 class PWGDiscriminator(nn.Layer):
    """A convolutional discriminator for audio.

-    Parameters
-    ----------
-    in_channels : int, optional
-        Number of channels of the input audio, by default 1
-    out_channels : int, optional
-        Output feature size, by default 1
-    kernel_size : int, optional
-        Kernel size of convolutional sublayers, by default 3
-    layers : int, optional
-        Number of layers, by default 10
-    conv_channels : int, optional
-        Feature size of the convolutional sublayers, by default 64
-    dilation_factor : int, optional
-        The factor with which dilation of each convolutional sublayers grows 
-        exponentially if it is greater than 1, else the dilation of each 
-        convolutional sublayers grows linearly, by default 1
-    nonlinear_activation : str, optional
-        The activation after each convolutional sublayer, by default "leakyrelu"
-    nonlinear_activation_params : Dict[str, Any], optional
-        The parameters passed to the activation's initializer, by default 
-        {"negative_slope": 0.2}
-    bias : bool, optional
-        Whether to use bias in convolutional sublayers, by default True
-    use_weight_norm : bool, optional
-        Whether to use weight normalization at all convolutional sublayers, 
-        by default True
+    Args:
+        in_channels (int, optional): Number of channels of the input audio, by default 1
+        out_channels (int, optional): Output feature size, by default 1
+        kernel_size (int, optional): Kernel size of convolutional sublayers, by default 3
+        layers (int, optional): Number of layers, by default 10
+        conv_channels (int, optional): Feature size of the convolutional sublayers, by default 64
+        dilation_factor (int, optional): The factor with which dilation of each convolutional sublayers grows 
+            exponentially if it is greater than 1, else the dilation of each convolutional sublayers grows linearly, 
+            by default 1
+        nonlinear_activation (str, optional): The activation after each convolutional sublayer, by default "leakyrelu"
+        nonlinear_activation_params (Dict[str, Any], optional): The parameters passed to the activation's initializer, by default 
+            {"negative_slope": 0.2}
+        bias (bool, optional): Whether to use bias in convolutional sublayers, by default True
+        use_weight_norm (bool, optional): Whether to use weight normalization at all convolutional sublayers, 
+            by default True
    """

    def __init__(
@@ -330,15 +288,12 @@ class PWGDiscriminator(nn.Layer):

    def forward(self, x):
        """
-        Parameters
-        ----------
-        x : Tensor
-            Shape (N, in_channels, num_samples), the input audio.
-
-        Returns
-        -------
-        Tensor
-            Shape (N, out_channels, num_samples), the predicted logits.
+
+        Args:
+            x (Tensor): Shape (N, in_channels, num_samples), the input audio.
+
+        Returns:
+            Tensor: Shape (N, out_channels, num_samples), the predicted logits.
        """
        return self.conv_layers(x)

@@ -362,39 +317,25 @@ class PWGDiscriminator(nn.Layer):
 class ResidualPWGDiscriminator(nn.Layer):
    """A wavenet-style discriminator for audio.

-    Parameters
-    ----------
-    in_channels : int, optional
-        Number of channels of the input audio, by default 1
-    out_channels : int, optional
-        Output feature size, by default 1
-    kernel_size : int, optional
-        Kernel size of residual blocks, by default 3
-    layers : int, optional
-        Number of residual blocks, by default 30
-    stacks : int, optional
-        Number of groups of residual blocks, within which the dilation 
-        of each residual blocks grows exponentially, by default 3
-    residual_channels : int, optional
-        Residual channels of residual blocks, by default 64
-    gate_channels : int, optional
-        Gate channels of residual blocks, by default 128
-    skip_channels : int, optional
-        Skip channels of residual blocks, by default 64
-    dropout : float, optional
-        Dropout probability of residual blocks, by default 0.
-    bias : bool, optional
-        Whether to use bias in residual blocks, by default True
-    use_weight_norm : bool, optional
-        Whether to use weight normalization in all convolutional layers, 
-        by default True
-    use_causal_conv : bool, optional
-        Whether to use causal convolution in residual blocks, by default False
-    nonlinear_activation : str, optional
-        Activation after convolutions other than those in residual blocks, 
-        by default "leakyrelu"
-    nonlinear_activation_params : Dict[str, Any], optional
-        Parameters to pass to the activation, by default {"negative_slope": 0.2}
+    Args:
+        in_channels (int, optional): Number of channels of the input audio, by default 1
+        out_channels (int, optional): Output feature size, by default 1
+        kernel_size (int, optional): Kernel size of residual blocks, by default 3
+        layers (int, optional): Number of residual blocks, by default 30
+        stacks (int, optional): Number of groups of residual blocks, within which the dilation 
+            of each residual blocks grows exponentially, by default 3
+        residual_channels (int, optional): Residual channels of residual blocks, by default 64
+        gate_channels (int, optional): Gate channels of residual blocks, by default 128
+        skip_channels (int, optional): Skip channels of residual blocks, by default 64
+        dropout (float, optional): Dropout probability of residual blocks, by default 0.
+        bias (bool, optional): Whether to use bias in residual blocks, by default True
+        use_weight_norm (bool, optional): Whether to use weight normalization in all convolutional layers, 
+            by default True
+        use_causal_conv (bool, optional): Whether to use causal convolution in residual blocks, by default False
+        nonlinear_activation (str, optional): Activation after convolutions other than those in residual blocks, 
+            by default "leakyrelu"
+        nonlinear_activation_params (Dict[str, Any], optional): Parameters to pass to the activation, 
+            by default {"negative_slope": 0.2}
    """

    def __init__(
@@ -463,15 +404,11 @@ class ResidualPWGDiscriminator(nn.Layer):

    def forward(self, x):
        """
-        Parameters
-        ----------
-        x : Tensor
-            Shape (N, in_channels, num_samples), the input audio.
-
-        Returns
-        -------
-        Tensor
-            Shape (N, out_channels, num_samples), the predicted logits.
+        Args:
+            x (Tensor): Shape (N, in_channels, num_samples), the input audio.
+
+        Returns:
+            Tensor: Shape (N, out_channels, num_samples), the predicted logits.
        """
        x = self.first_conv(x)
        skip = 0

--- a/paddlespeech/t2s/models/tacotron2/tacotron2.py
+++ b/paddlespeech/t2s/models/tacotron2/tacotron2.py
@@ -81,69 +81,39 @@ class Tacotron2(nn.Layer):
            # training related
            init_type: str="xavier_uniform", ):
        """Initialize Tacotron2 module.
-        Parameters
-        ----------
-        idim : int
-            Dimension of the inputs.
-        odim : int
-            Dimension of the outputs.
-        embed_dim : int
-            Dimension of the token embedding.
-        elayers : int
-            Number of encoder blstm layers.
-        eunits : int
-            Number of encoder blstm units.
-        econv_layers : int
-            Number of encoder conv layers.
-        econv_filts : int
-            Number of encoder conv filter size.
-        econv_chans : int
-            Number of encoder conv filter channels.
-        dlayers : int
-            Number of decoder lstm layers.
-        dunits : int
-            Number of decoder lstm units.
-        prenet_layers : int
-            Number of prenet layers.
-        prenet_units : int
-            Number of prenet units.
-        postnet_layers : int
-            Number of postnet layers.
-        postnet_filts : int
-            Number of postnet filter size.
-        postnet_chans : int
-            Number of postnet filter channels.
-        output_activation : str
-            Name of activation function for outputs.
-        adim : int
-            Number of dimension of mlp in attention.
-        aconv_chans : int
-            Number of attention conv filter channels.
-        aconv_filts : int
-            Number of attention conv filter size.
-        cumulate_att_w : bool
-            Whether to cumulate previous attention weight.
-        use_batch_norm : bool
-            Whether to use batch normalization.
-        use_concate : bool
-            Whether to concat enc outputs w/ dec lstm outputs.
-        reduction_factor : int
-            Reduction factor.
-        spk_num : Optional[int]
-            Number of speakers. If set to > 1, assume that the
-            sids will be provided as the input and use sid embedding layer.
-        lang_num : Optional[int]
-            Number of languages. If set to > 1, assume that the
-            lids will be provided as the input and use sid embedding layer.
-        spk_embed_dim : Optional[int]
-            Speaker embedding dimension. If set to > 0,
-            assume that spk_emb will be provided as the input.
-        spk_embed_integration_type : str
-            How to integrate speaker embedding.
-        dropout_rate : float
-            Dropout rate.
-        zoneout_rate : float
-            Zoneout rate.
+        Args:
+            idim (int): Dimension of the inputs.
+            odim (int): Dimension of the outputs.
+            embed_dim (int): Dimension of the token embedding.
+            elayers (int): Number of encoder blstm layers.
+            eunits (int): Number of encoder blstm units.
+            econv_layers (int): Number of encoder conv layers.
+            econv_filts (int): Number of encoder conv filter size.
+            econv_chans (int): Number of encoder conv filter channels.
+            dlayers (int): Number of decoder lstm layers.
+            dunits (int): Number of decoder lstm units.
+            prenet_layers (int): Number of prenet layers.
+            prenet_units (int): Number of prenet units.
+            postnet_layers (int): Number of postnet layers.
+            postnet_filts (int): Number of postnet filter size.
+            postnet_chans (int): Number of postnet filter channels.
+            output_activation (str): Name of activation function for outputs.
+            adim (int): Number of dimension of mlp in attention.
+            aconv_chans (int): Number of attention conv filter channels.
+            aconv_filts (int): Number of attention conv filter size.
+            cumulate_att_w (bool): Whether to cumulate previous attention weight.
+            use_batch_norm (bool): Whether to use batch normalization.
+            use_concate (bool): Whether to concat enc outputs w/ dec lstm outputs.
+            reduction_factor (int): Reduction factor.
+            spk_num (Optional[int]): Number of speakers. If set to > 1, assume that the
+                sids will be provided as the input and use sid embedding layer.
+            lang_num (Optional[int]): Number of languages. If set to > 1, assume that the
+                lids will be provided as the input and use sid embedding layer.
+            spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0,
+                assume that spk_emb will be provided as the input.
+            spk_embed_integration_type (str): How to integrate speaker embedding.
+            dropout_rate (float): Dropout rate.
+            zoneout_rate (float): Zoneout rate.
        """
        assert check_argument_types()
        super().__init__()
@@ -258,31 +228,19 @@ class Tacotron2(nn.Layer):
    ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
        """Calculate forward propagation.

-        Parameters
-        ----------
-        text : Tensor(int64)
-            Batch of padded character ids (B, T_text).
-        text_lengths : Tensor(int64)
-            Batch of lengths of each input batch (B,).
-        speech : Tensor
-            Batch of padded target features (B, T_feats, odim).
-        speech_lengths : Tensor(int64)
-            Batch of the lengths of each target (B,).
-        spk_emb : Optional[Tensor]
-            Batch of speaker embeddings (B, spk_embed_dim).
-        spk_id : Optional[Tensor]
-            Batch of speaker IDs (B, 1).
-        lang_id : Optional[Tensor]
-            Batch of language IDs (B, 1).
-
-        Returns
-        ----------
-        Tensor
-            Loss scalar value.
-        Dict
-            Statistics to be monitored.
-        Tensor
-            Weight value if not joint training else model outputs.
+        Args:
+            text (Tensor(int64)): Batch of padded character ids (B, T_text).
+            text_lengths (Tensor(int64)): Batch of lengths of each input batch (B,).
+            speech (Tensor): Batch of padded target features (B, T_feats, odim).
+            speech_lengths (Tensor(int64)): Batch of the lengths of each target (B,).
+            spk_emb (Optional[Tensor]): Batch of speaker embeddings (B, spk_embed_dim).
+            spk_id (Optional[Tensor]): Batch of speaker IDs (B, 1).
+            lang_id (Optional[Tensor]): Batch of language IDs (B, 1).
+
+        Returns:
+            Tensor: Loss scalar value.
+            Dict: Statistics to be monitored.
+            Tensor: Weight value if not joint training else model outputs.

        """
        text = text[:, :text_lengths.max()]
@@ -369,40 +327,26 @@ class Tacotron2(nn.Layer):
            use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]:
        """Generate the sequence of features given the sequences of characters.

-        Parameters
-        ----------
-        text Tensor(int64)
-            Input sequence of characters (T_text,).
-        speech : Optional[Tensor]
-            Feature sequence to extract style (N, idim).
-        spk_emb : ptional[Tensor]
-            Speaker embedding (spk_embed_dim,).
-        spk_id : Optional[Tensor]
-            Speaker ID (1,).
-        lang_id : Optional[Tensor]
-            Language ID (1,).
-        threshold : float
-            Threshold in inference.
-        minlenratio : float
-            Minimum length ratio in inference.
-        maxlenratio : float
-            Maximum length ratio in inference.
-        use_att_constraint : bool
-            Whether to apply attention constraint.
-        backward_window : int
-            Backward window in attention constraint.
-        forward_window : int
-            Forward window in attention constraint.
-        use_teacher_forcing : bool
-            Whether to use teacher forcing.
-
-        Return
-        ----------
-        Dict[str, Tensor]
-        Output dict including the following items:
-            * feat_gen (Tensor): Output sequence of features (T_feats, odim).
-            * prob (Tensor): Output sequence of stop probabilities (T_feats,).
-            * att_w (Tensor): Attention weights (T_feats, T).
+        Args:
+            text (Tensor(int64)): Input sequence of characters (T_text,).
+            speech (Optional[Tensor]): Feature sequence to extract style (N, idim).
+            spk_emb (Optional[Tensor]): Speaker embedding (spk_embed_dim,).
+            spk_id (Optional[Tensor]): Speaker ID (1,).
+            lang_id (Optional[Tensor]): Language ID (1,).
+            threshold (float): Threshold in inference.
+            minlenratio (float): Minimum length ratio in inference.
+            maxlenratio (float): Maximum length ratio in inference.
+            use_att_constraint (bool): Whether to apply attention constraint.
+            backward_window (int): Backward window in attention constraint.
+            forward_window (int): Forward window in attention constraint.
+            use_teacher_forcing (bool): Whether to use teacher forcing.
+
+        Returns:
+            Dict[str, Tensor]:
+                Output dict including the following items:
+                * feat_gen (Tensor): Output sequence of features (T_feats, odim).
+                * prob (Tensor): Output sequence of stop probabilities (T_feats,).
+                * att_w (Tensor): Attention weights (T_feats, T).

        """
        x = text
@@ -458,18 +402,13 @@ class Tacotron2(nn.Layer):
                                  spk_emb: paddle.Tensor) -> paddle.Tensor:
        """Integrate speaker embedding with hidden states.

-        Parameters
-        ----------
-         hs : Tensor
-            Batch of hidden state sequences (B, Tmax, eunits).
-         spk_emb : Tensor
-            Batch of speaker embeddings (B, spk_embed_dim).
-
-        Returns
-        ----------
-         Tensor
-            Batch of integrated hidden state sequences (B, Tmax, eunits) if
-            integration_type is "add" else (B, Tmax, eunits + spk_embed_dim).
+        Args:
+            hs (Tensor): Batch of hidden state sequences (B, Tmax, eunits).
+            spk_emb (Tensor): Batch of speaker embeddings (B, spk_embed_dim).
+
+        Returns:
+            Tensor: Batch of integrated hidden state sequences (B, Tmax, eunits) if
+                integration_type is "add" else (B, Tmax, eunits + spk_embed_dim).

        """
        if self.spk_embed_integration_type == "add":

--- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
+++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
@@ -48,127 +48,67 @@ class TransformerTTS(nn.Layer):
    .. _`Neural Speech Synthesis with Transformer Network`:
        https://arxiv.org/pdf/1809.08895.pdf

-    Parameters
-    ----------
-    idim : int
-        Dimension of the inputs.
-    odim : int
-        Dimension of the outputs.
-    embed_dim : int, optional
-        Dimension of character embedding.
-    eprenet_conv_layers : int, optional
-        Number of encoder prenet convolution layers.
-    eprenet_conv_chans : int, optional
-        Number of encoder prenet convolution channels.
-    eprenet_conv_filts : int, optional
-        Filter size of encoder prenet convolution.
-    dprenet_layers : int, optional
-        Number of decoder prenet layers.
-    dprenet_units : int, optional
-        Number of decoder prenet hidden units.
-    elayers : int, optional
-        Number of encoder layers.
-    eunits : int, optional
-        Number of encoder hidden units.
-    adim : int, optional
-        Number of attention transformation dimensions.
-    aheads : int, optional
-        Number of heads for multi head attention.
-    dlayers : int, optional
-        Number of decoder layers.
-    dunits : int, optional
-        Number of decoder hidden units.
-    postnet_layers : int, optional
-        Number of postnet layers.
-    postnet_chans : int, optional
-        Number of postnet channels.
-    postnet_filts : int, optional
-        Filter size of postnet.
-    use_scaled_pos_enc : pool, optional
-        Whether to use trainable scaled positional encoding.
-    use_batch_norm : bool, optional
-        Whether to use batch normalization in encoder prenet.
-    encoder_normalize_before : bool, optional
-        Whether to perform layer normalization before encoder block.
-    decoder_normalize_before : bool, optional
-        Whether to perform layer normalization before decoder block.
-    encoder_concat_after : bool, optional
-        Whether to concatenate attention layer's input and output in encoder.
-    decoder_concat_after : bool, optional
-        Whether to concatenate attention layer's input and output in decoder.
-    positionwise_layer_type : str, optional
-        Position-wise operation type.
-    positionwise_conv_kernel_size : int, optional
-        Kernel size in position wise conv 1d.
-    reduction_factor : int, optional
-        Reduction factor.
-    spk_embed_dim : int, optional
-        Number of speaker embedding dimenstions.
-    spk_embed_integration_type : str, optional
-        How to integrate speaker embedding.
-    use_gst : str, optional
-        Whether to use global style token.
-    gst_tokens : int, optional
-        The number of GST embeddings.
-    gst_heads : int, optional
-        The number of heads in GST multihead attention.
-    gst_conv_layers : int, optional
-        The number of conv layers in GST.
-    gst_conv_chans_list : Sequence[int], optional
-            List of the number of channels of conv layers in GST.
-    gst_conv_kernel_size : int, optional
-        Kernal size of conv layers in GST.
-    gst_conv_stride : int, optional
-        Stride size of conv layers in GST.
-    gst_gru_layers : int, optional
-        The number of GRU layers in GST.
-    gst_gru_units : int, optional
-        The number of GRU units in GST.
-    transformer_lr : float, optional
-        Initial value of learning rate.
-    transformer_warmup_steps : int, optional
-        Optimizer warmup steps.
-    transformer_enc_dropout_rate : float, optional
-        Dropout rate in encoder except attention and positional encoding.
-    transformer_enc_positional_dropout_rate : float, optional
-        Dropout rate after encoder positional encoding.
-    transformer_enc_attn_dropout_rate : float, optional
-        Dropout rate in encoder self-attention module.
-    transformer_dec_dropout_rate : float, optional
-        Dropout rate in decoder except attention & positional encoding.
-    transformer_dec_positional_dropout_rate : float, optional
-        Dropout rate after decoder positional encoding.
-    transformer_dec_attn_dropout_rate : float, optional
-        Dropout rate in deocoder self-attention module.
-    transformer_enc_dec_attn_dropout_rate : float, optional
-        Dropout rate in encoder-deocoder attention module.
-    init_type : str, optional
-        How to initialize transformer parameters.
-    init_enc_alpha : float, optional
-        Initial value of alpha in scaled pos encoding of the encoder.
-    init_dec_alpha : float, optional
-        Initial value of alpha in scaled pos encoding of the decoder.
-    eprenet_dropout_rate : float, optional
-        Dropout rate in encoder prenet.
-    dprenet_dropout_rate : float, optional
-        Dropout rate in decoder prenet.
-    postnet_dropout_rate : float, optional
-        Dropout rate in postnet.
-    use_masking : bool, optional
-        Whether to apply masking for padded part in loss calculation.
-    use_weighted_masking : bool, optional
-        Whether to apply weighted masking in loss calculation.
-    bce_pos_weight : float, optional
-        Positive sample weight in bce calculation (only for use_masking=true).
-    loss_type : str, optional
-        How to calculate loss.
-    use_guided_attn_loss : bool, optional
-        Whether to use guided attention loss.
-    num_heads_applied_guided_attn : int, optional
-        Number of heads in each layer to apply guided attention loss.
-    num_layers_applied_guided_attn : int, optional
-        Number of layers to apply guided attention loss.
-        List of module names to apply guided attention loss.
+    Args:
+        idim (int): Dimension of the inputs.
+        odim (int): Dimension of the outputs.
+        embed_dim (int, optional): Dimension of character embedding.
+        eprenet_conv_layers (int, optional): Number of encoder prenet convolution layers.
+        eprenet_conv_chans (int, optional): Number of encoder prenet convolution channels.
+        eprenet_conv_filts (int, optional): Filter size of encoder prenet convolution.
+        dprenet_layers (int, optional): Number of decoder prenet layers.
+        dprenet_units (int, optional): Number of decoder prenet hidden units.
+        elayers (int, optional): Number of encoder layers.
+        eunits (int, optional): Number of encoder hidden units.
+        adim (int, optional): Number of attention transformation dimensions.
+        aheads (int, optional): Number of heads for multi head attention.
+        dlayers (int, optional): Number of decoder layers.
+        dunits (int, optional): Number of decoder hidden units.
+        postnet_layers (int, optional): Number of postnet layers.
+        postnet_chans (int, optional): Number of postnet channels.
+        postnet_filts (int, optional): Filter size of postnet.
+        use_scaled_pos_enc (bool, optional): Whether to use trainable scaled positional encoding.
+        use_batch_norm (bool, optional): Whether to use batch normalization in encoder prenet.
+        encoder_normalize_before (bool, optional): Whether to perform layer normalization before encoder block.
+        decoder_normalize_before (bool, optional): Whether to perform layer normalization before decoder block.
+        encoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in encoder.
+        decoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in decoder.
+        positionwise_layer_type (str, optional): Position-wise operation type.
+        positionwise_conv_kernel_size (int, optional): Kernel size in position wise conv 1d.
+        reduction_factor (int, optional): Reduction factor.
+        spk_embed_dim (int, optional): Number of speaker embedding dimensions.
+        spk_embed_integration_type (str, optional): How to integrate speaker embedding.
+        use_gst (str, optional): Whether to use global style token.
+        gst_tokens (int, optional): The number of GST embeddings.
+        gst_heads (int, optional): The number of heads in GST multihead attention.
+        gst_conv_layers (int, optional): The number of conv layers in GST.
+        gst_conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in GST.
+        gst_conv_kernel_size (int, optional): Kernel size of conv layers in GST.
+        gst_conv_stride (int, optional): Stride size of conv layers in GST.
+        gst_gru_layers (int, optional): The number of GRU layers in GST.
+        gst_gru_units (int, optional): The number of GRU units in GST.
+        transformer_lr (float, optional): Initial value of learning rate.
+        transformer_warmup_steps (int, optional): Optimizer warmup steps.
+        transformer_enc_dropout_rate (float, optional): Dropout rate in encoder except attention and positional encoding.
+        transformer_enc_positional_dropout_rate (float, optional): Dropout rate after encoder positional encoding.
+        transformer_enc_attn_dropout_rate (float, optional): Dropout rate in encoder self-attention module.
+        transformer_dec_dropout_rate (float, optional): Dropout rate in decoder except attention & positional encoding.
+        transformer_dec_positional_dropout_rate (float, optional): Dropout rate after decoder positional encoding.
+        transformer_dec_attn_dropout_rate (float, optional): Dropout rate in decoder self-attention module.
+        transformer_enc_dec_attn_dropout_rate (float, optional): Dropout rate in encoder-decoder attention module.
+        init_type (str, optional): How to initialize transformer parameters.
+        init_enc_alpha (float, optional): Initial value of alpha in scaled pos encoding of the encoder.
+        init_dec_alpha (float, optional): Initial value of alpha in scaled pos encoding of the decoder.
+        eprenet_dropout_rate (float, optional): Dropout rate in encoder prenet.
+        dprenet_dropout_rate (float, optional): Dropout rate in decoder prenet.
+        postnet_dropout_rate (float, optional): Dropout rate in postnet.
+        use_masking (bool, optional): Whether to apply masking for padded part in loss calculation.
+        use_weighted_masking (bool, optional): Whether to apply weighted masking in loss calculation.
+        bce_pos_weight (float, optional): Positive sample weight in bce calculation (only for use_masking=true).
+        loss_type (str, optional): How to calculate loss.
+        use_guided_attn_loss (bool, optional): Whether to use guided attention loss.
+        num_heads_applied_guided_attn (int, optional): Number of heads in each layer to apply guided attention loss.
+        num_layers_applied_guided_attn (int, optional): Number of layers to apply guided attention loss.
+            List of module names to apply guided attention loss.
    """

    def __init__(
@@ -398,25 +338,16 @@ class TransformerTTS(nn.Layer):
    ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
        """Calculate forward propagation.

-        Parameters
-        ----------
-        text : Tensor(int64)
-            Batch of padded character ids (B, Tmax).
-        text_lengths : Tensor(int64)
-            Batch of lengths of each input batch (B,).
-        speech : Tensor
-            Batch of padded target features (B, Lmax, odim).
-        speech_lengths : Tensor(int64)
-            Batch of the lengths of each target (B,).
-        spk_emb : Tensor, optional
-            Batch of speaker embeddings (B, spk_embed_dim).
-
-        Returns
-        ----------
-        Tensor
-            Loss scalar value.
-        Dict
-            Statistics to be monitored.
+        Args:
+            text(Tensor(int64)): Batch of padded character ids (B, Tmax).
+            text_lengths(Tensor(int64)): Batch of lengths of each input batch (B,).
+            speech(Tensor): Batch of padded target features (B, Lmax, odim).
+            speech_lengths(Tensor(int64)): Batch of the lengths of each target (B,).
+            spk_emb(Tensor, optional): Batch of speaker embeddings (B, spk_embed_dim).
+
+        Returns:
+            Tensor: Loss scalar value.
+            Dict: Statistics to be monitored.

        """
        # input of embedding must be int64
@@ -525,31 +456,19 @@ class TransformerTTS(nn.Layer):
    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """Generate the sequence of features given the sequences of characters.

-        Parameters
-        ----------
-        text : Tensor(int64)
-            Input sequence of characters (T,).
-        speech : Tensor, optional
-            Feature sequence to extract style (N, idim).
-        spk_emb : Tensor, optional
-            Speaker embedding vector (spk_embed_dim,).
-        threshold : float, optional
-            Threshold in inference.
-        minlenratio : float, optional
-            Minimum length ratio in inference.
-        maxlenratio : float, optional
-            Maximum length ratio in inference.
-        use_teacher_forcing : bool, optional
-            Whether to use teacher forcing.
-
-        Returns
-        ----------
-        Tensor
-            Output sequence of features (L, odim).
-        Tensor
-            Output sequence of stop probabilities (L,).
-        Tensor
-            Encoder-decoder (source) attention weights (#layers, #heads, L, T).
+        Args:
+            text(Tensor(int64)): Input sequence of characters (T,).
+            speech(Tensor, optional): Feature sequence to extract style (N, idim).
+            spk_emb(Tensor, optional): Speaker embedding vector (spk_embed_dim,).
+            threshold(float, optional): Threshold in inference.
+            minlenratio(float, optional): Minimum length ratio in inference.
+            maxlenratio(float, optional): Maximum length ratio in inference.
+            use_teacher_forcing(bool, optional): Whether to use teacher forcing.
+
+        Returns:
+            Tensor: Output sequence of features (L, odim).
+            Tensor: Output sequence of stop probabilities (L,).
+            Tensor: Encoder-decoder (source) attention weights (#layers, #heads, L, T).

        """
        # input of embedding must be int64
@@ -671,23 +590,17 @@ class TransformerTTS(nn.Layer):
    def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor:
        """Make masks for self-attention.

-        Parameters
-        ----------
-        ilens : Tensor
-            Batch of lengths (B,).
+        Args:
+            ilens(Tensor): Batch of lengths (B,).

-        Returns
-        -------
-        Tensor
-            Mask tensor for self-attention.
-            dtype=paddle.bool
+        Returns:
+            Tensor: Mask tensor for self-attention. dtype=paddle.bool

-        Examples
-        -------
-        >>> ilens = [5, 3]
-        >>> self._source_mask(ilens)
-        tensor([[[1, 1, 1, 1, 1],
-                    [1, 1, 1, 0, 0]]]) bool
+        Examples:
+            >>> ilens = [5, 3]
+            >>> self._source_mask(ilens)
+            tensor([[[1, 1, 1, 1, 1],
+                        [1, 1, 1, 0, 0]]]) bool

        """
        x_masks = make_non_pad_mask(ilens)
@@ -696,30 +609,25 @@ class TransformerTTS(nn.Layer):
    def _target_mask(self, olens: paddle.Tensor) -> paddle.Tensor:
        """Make masks for masked self-attention.

-        Parameters
-        ----------
-            olens : LongTensor
-                Batch of lengths (B,).
-
-        Returns
-        ----------
-        Tensor
-            Mask tensor for masked self-attention.
-
-        Examples
-        ----------
-        >>> olens = [5, 3]
-        >>> self._target_mask(olens)
-        tensor([[[1, 0, 0, 0, 0],
-                    [1, 1, 0, 0, 0],
-                    [1, 1, 1, 0, 0],
-                    [1, 1, 1, 1, 0],
-                    [1, 1, 1, 1, 1]],
-                [[1, 0, 0, 0, 0],
-                    [1, 1, 0, 0, 0],
-                    [1, 1, 1, 0, 0],
-                    [1, 1, 1, 0, 0],
-                    [1, 1, 1, 0, 0]]], dtype=paddle.uint8)
+        Args:
+            olens (Tensor(int64)): Batch of lengths (B,).
+
+        Returns:
+            Tensor: Mask tensor for masked self-attention.
+
+        Examples:
+            >>> olens = [5, 3]
+            >>> self._target_mask(olens)
+            tensor([[[1, 0, 0, 0, 0],
+                        [1, 1, 0, 0, 0],
+                        [1, 1, 1, 0, 0],
+                        [1, 1, 1, 1, 0],
+                        [1, 1, 1, 1, 1]],
+                    [[1, 0, 0, 0, 0],
+                        [1, 1, 0, 0, 0],
+                        [1, 1, 1, 0, 0],
+                        [1, 1, 1, 0, 0],
+                        [1, 1, 1, 0, 0]]], dtype=paddle.uint8)

        """
        y_masks = make_non_pad_mask(olens)
@@ -731,17 +639,12 @@ class TransformerTTS(nn.Layer):
                                  spk_emb: paddle.Tensor) -> paddle.Tensor:
        """Integrate speaker embedding with hidden states.

-        Parameters
-        ----------
-        hs : Tensor
-            Batch of hidden state sequences (B, Tmax, adim).
-        spk_emb : Tensor
-            Batch of speaker embeddings (B, spk_embed_dim).
-
-        Returns
-        ----------
-        Tensor
-            Batch of integrated hidden state sequences (B, Tmax, adim).
+        Args:
+            hs(Tensor): Batch of hidden state sequences (B, Tmax, adim).
+            spk_emb(Tensor): Batch of speaker embeddings (B, spk_embed_dim).
+
+        Returns:
+            Tensor: Batch of integrated hidden state sequences (B, Tmax, adim).

        """
        if self.spk_embed_integration_type == "add":

--- a/paddlespeech/t2s/models/waveflow.py
+++ b/paddlespeech/t2s/models/waveflow.py
--- a/paddlespeech/t2s/models/wavernn/wavernn.py
+++ b/paddlespeech/t2s/models/wavernn/wavernn.py
@@ -67,14 +67,10 @@ class MelResNet(nn.Layer):

    def forward(self, x):
        '''
-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, in_dims, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, res_out_dims, T).
+        Args:
+            x (Tensor): Input tensor (B, in_dims, T).
+        Returns:
+            Tensor: Output tensor (B, res_out_dims, T).
        '''

        x = self.conv_in(x)
@@ -121,16 +117,11 @@ class UpsampleNetwork(nn.Layer):

    def forward(self, m):
        '''
-        Parameters
-        ----------
-        c : Tensor
-            Input tensor (B, C_aux, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, (T - 2 * pad) *  prob(upsample_scales), C_aux).
-        Tensor
-            Output tensor (B, (T - 2 * pad) *  prob(upsample_scales), res_out_dims).
+        Args:
+            m (Tensor): Input tensor (B, C_aux, T).
+        Returns:
+            Tensor: Output tensor (B, (T - 2 * pad) * prod(upsample_scales), C_aux).
+            Tensor: Output tensor (B, (T - 2 * pad) * prod(upsample_scales), res_out_dims).
        '''
        # aux: [B, C_aux, T] 
        # -> [B, res_out_dims, T - 2 * aux_context_window]
@@ -172,32 +163,20 @@ class WaveRNN(nn.Layer):
            mode='RAW',
            init_type: str="xavier_uniform", ):
        '''
-        Parameters
-        ----------
-        rnn_dims : int, optional
-            Hidden dims of RNN Layers.
-        fc_dims : int, optional
-             Dims of FC Layers.
-        bits : int, optional
-            bit depth of signal.
-        aux_context_window : int, optional
-            The context window size of the first convolution applied to the 
-            auxiliary input, by default 2
-        upsample_scales : List[int], optional
-            Upsample scales of the upsample network.
-        aux_channels : int, optional
-            Auxiliary channel of the residual blocks.
-        compute_dims : int, optional
-            Dims of Conv1D in MelResNet.
-        res_out_dims : int, optional
-            Dims of output in MelResNet.
-        res_blocks : int, optional
-            Number of residual blocks.
-        mode : str, optional
-            Output mode of the WaveRNN vocoder. `MOL` for Mixture of Logistic Distribution,
-            and `RAW` for quantized bits as the model's output.
-        init_type : str
-            How to initialize parameters.
+        Args:
+            rnn_dims (int, optional): Hidden dims of RNN Layers.
+            fc_dims (int, optional): Dims of FC Layers.
+            bits (int, optional): bit depth of signal.
+            aux_context_window (int, optional): The context window size of the first convolution applied to the 
+                auxiliary input, by default 2
+            upsample_scales (List[int], optional): Upsample scales of the upsample network.
+            aux_channels (int, optional): Auxiliary channel of the residual blocks.
+            compute_dims (int, optional): Dims of Conv1D in MelResNet.
+            res_out_dims (int, optional): Dims of output in MelResNet.
+            res_blocks (int, optional): Number of residual blocks.
+            mode (str, optional): Output mode of the WaveRNN vocoder. 
+                `MOL` for Mixture of Logistic Distribution, and `RAW` for quantized bits as the model's output.
+            init_type (str): How to initialize parameters.
        '''
        super().__init__()
        self.mode = mode
@@ -245,18 +224,13 @@ class WaveRNN(nn.Layer):

    def forward(self, x, c):
        '''
-        Parameters
-        ----------
-        x : Tensor
-            wav sequence, [B, T]
-        c : Tensor
-            mel spectrogram [B, C_aux, T']
-        
-        T = (T' - 2 * aux_context_window ) * hop_length
-        Returns
-        ----------
-        Tensor
-            [B, T, n_classes]
+        Args:
+            x (Tensor): wav sequence, [B, T]
+            c (Tensor): mel spectrogram [B, C_aux, T']
+
+            T = (T' - 2 * aux_context_window ) * hop_length
+        Returns:
+            Tensor: [B, T, n_classes]
        '''
        # Although we `_flatten_parameters()` on init, when using DataParallel
        # the model gets replicated, making it no longer guaranteed that the
@@ -304,22 +278,14 @@ class WaveRNN(nn.Layer):
                 mu_law: bool=True,
                 gen_display: bool=False):
        """
-        Parameters
-        ----------
-        c : Tensor
-            input mels, (T', C_aux)
-        batched : bool
-            generate in batch or not
-        target : int
-            target number of samples to be generated in each batch entry
-        overlap : int
-            number of samples for crossfading between batches
-        mu_law : bool
-            use mu law or not
-        Returns
-        ----------
-        wav sequence
-            Output (T' * prod(upsample_scales), out_channels, C_out).
+        Args:
+            c(Tensor): input mels, (T', C_aux)
+            batched(bool): generate in batch or not
+            target(int): target number of samples to be generated in each batch entry
+            overlap(int): number of samples for crossfading between batches
+            mu_law(bool): use mu law or not
+        Returns: 
+            wav sequence: Output (T' * prod(upsample_scales), out_channels, C_out).
        """

        self.eval()
@@ -434,16 +400,13 @@ class WaveRNN(nn.Layer):

    def pad_tensor(self, x, pad, side='both'):
        '''
-        Parameters
-        ----------
-        x : Tensor
-            mel, [1, n_frames, 80]
-        pad : int
-        side : str 
-            'both', 'before' or 'after'
-        Returns
-        ----------
-        Tensor
+        Args:
+            x(Tensor): mel, [1, n_frames, 80]
+            pad(int): 
+            side(str, optional): 'both', 'before' or 'after'. (Default value = 'both')
+
+        Returns:
+            Tensor
        '''
        b, t, _ = paddle.shape(x)
        # for dygraph to static graph
@@ -461,38 +424,29 @@ class WaveRNN(nn.Layer):
        Fold the tensor with overlap for quick batched inference.
        Overlap will be used for crossfading in xfade_and_unfold()

-        Parameters
-        ----------
-        x : Tensor
-            Upsampled conditioning features. mels or aux
-            shape=(1, T, features)
-            mels: [1, T, 80]
-            aux: [1, T, 128]
-        target : int
-            Target timesteps for each index of batch
-        overlap : int
-            Timesteps for both xfade and rnn warmup
-            overlap = hop_length * 2
-
-        Returns
-        ----------
-        Tensor 
-            shape=(num_folds, target + 2 * overlap, features)
-            num_flods = (time_seq - overlap) // (target + overlap)
-            mel: [num_folds, target + 2 * overlap, 80]
-            aux: [num_folds, target + 2 * overlap, 128]
-
-        Details
-        ----------
-        x = [[h1, h2, ... hn]]
-
-        Where each h is a vector of conditioning features
-
-        Eg: target=2, overlap=1 with x.size(1)=10
-
-        folded = [[h1, h2, h3, h4],
-                  [h4, h5, h6, h7],
-                  [h7, h8, h9, h10]]
+        Args:
+            x(Tensor): Upsampled conditioning features. mels or aux
+                shape=(1, T, features)
+                mels: [1, T, 80]
+                aux: [1, T, 128]
+            target(int): Target timesteps for each index of batch
+            overlap(int): Timesteps for both xfade and rnn warmup
+
+        Returns:
+            Tensor: 
+                shape=(num_folds, target + 2 * overlap, features)
+                num_folds = (time_seq - overlap) // (target + overlap)
+                mel: [num_folds, target + 2 * overlap, 80]
+                aux: [num_folds, target + 2 * overlap, 128]
+
+        Details:
+            x = [[h1, h2, ... hn]]
+            Where each h is a vector of conditioning features
+            Eg: target=2, overlap=1 with x.size(1)=10
+
+            folded = [[h1, h2, h3, h4],
+                    [h4, h5, h6, h7],
+                    [h7, h8, h9, h10]]
        '''

        _, total_len, features = paddle.shape(x)
@@ -520,37 +474,33 @@ class WaveRNN(nn.Layer):
    def xfade_and_unfold(self, y, target: int=12000, overlap: int=600):
        ''' Applies a crossfade and unfolds into a 1d array.

-        Parameters
-        ----------
-        y : Tensor
-            Batched sequences of audio samples
-            shape=(num_folds, target + 2 * overlap)
-            dtype=paddle.float32
-        overlap : int
-            Timesteps for both xfade and rnn warmup
-
-        Returns
-        ----------
-        Tensor
-            audio samples in a 1d array
-            shape=(total_len)
-            dtype=paddle.float32
-
-        Details
-        ----------
-        y = [[seq1],
-            [seq2],
-            [seq3]]
-
-        Apply a gain envelope at both ends of the sequences
-
-        y = [[seq1_in, seq1_target, seq1_out],
-            [seq2_in, seq2_target, seq2_out],
-            [seq3_in, seq3_target, seq3_out]]
-
-        Stagger and add up the groups of samples:
-
-        [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...]
+        Args:
+            y (Tensor): 
+                Batched sequences of audio samples
+                shape=(num_folds, target + 2 * overlap)
+                dtype=paddle.float32
+            overlap (int): Timesteps for both xfade and rnn warmup
+
+        Returns:
+            Tensor:
+                audio samples in a 1d array
+                shape=(total_len)
+                dtype=paddle.float32
+
+        Details:
+            y = [[seq1],
+                [seq2],
+                [seq3]]
+
+            Apply a gain envelope at both ends of the sequences
+
+            y = [[seq1_in, seq1_target, seq1_out],
+                [seq2_in, seq2_target, seq2_out],
+                [seq3_in, seq3_target, seq3_out]]
+
+            Stagger and add up the groups of samples:
+
+            [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...]

        '''
        # num_folds = (total_len - overlap) // (target + overlap)

--- a/paddlespeech/t2s/modules/causal_conv.py
+++ b/paddlespeech/t2s/modules/causal_conv.py
@@ -41,14 +41,10 @@ class CausalConv1D(nn.Layer):

    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, in_channels, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, out_channels, T).
+        Args:
+            x (Tensor): Input tensor (B, in_channels, T).
+        Returns: 
+            Tensor: Output tensor (B, out_channels, T).
        """
        return self.conv(self.pad(x))[:, :, :x.shape[2]]

@@ -70,13 +66,9 @@ class CausalConv1DTranspose(nn.Layer):

    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, in_channels, T_in).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, out_channels, T_out).
+        Args:
+            x (Tensor): Input tensor (B, in_channels, T_in).
+        Returns:
+            Tensor: Output tensor (B, out_channels, T_out).
        """
        return self.deconv(x)[:, :, :-self.stride]
--- a/paddlespeech/t2s/modules/conformer/convolution.py
+++ b/paddlespeech/t2s/modules/conformer/convolution.py
@@ -18,12 +18,10 @@ from paddle import nn

 class ConvolutionModule(nn.Layer):
    """ConvolutionModule in Conformer model.
-    Parameters
-    ----------
-    channels : int
-        The number of channels of conv layers.
-    kernel_size : int
-        Kernerl size of conv layers.
+
+    Args:
+        channels (int): The number of channels of conv layers.
+        kernel_size (int): Kernel size of conv layers.
    """

    def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True):
@@ -59,14 +57,11 @@ class ConvolutionModule(nn.Layer):

    def forward(self, x):
        """Compute convolution module.
-        Parameters
-        ----------
-        x : paddle.Tensor
-            Input tensor (#batch, time, channels).
-        Returns
-        ----------
-        paddle.Tensor
-            Output tensor (#batch, time, channels).
+
+        Args:
+            x (Tensor): Input tensor (#batch, time, channels).
+        Returns:
+            Tensor: Output tensor (#batch, time, channels).
        """
        # exchange the temporal dimension and the feature dimension
        x = x.transpose([0, 2, 1])

--- a/paddlespeech/t2s/modules/conformer/encoder_layer.py
+++ b/paddlespeech/t2s/modules/conformer/encoder_layer.py
@@ -21,38 +21,29 @@ from paddlespeech.t2s.modules.layer_norm import LayerNorm

 class EncoderLayer(nn.Layer):
    """Encoder layer module.
-    Parameters
-    ----------
-    size : int
-        Input dimension.
-    self_attn : nn.Layer
-        Self-attention module instance.
-        `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
-        can be used as the argument.
-    feed_forward : nn.Layer
-        Feed-forward module instance.
-        `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
-        can be used as the argument.
-    feed_forward_macaron : nn.Layer
-        Additional feed-forward module instance.
-        `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
-        can be used as the argument.
-    conv_module : nn.Layer
-        Convolution module instance.
-        `ConvlutionModule` instance can be used as the argument.
-    dropout_rate : float
-        Dropout rate.
-    normalize_before : bool
-        Whether to use layer_norm before the first block.
-    concat_after : bool
-        Whether to concat attention layer's input and output.
-        if True, additional linear will be applied.
-        i.e. x -> x + linear(concat(x, att(x)))
-        if False, no additional linear will be applied. i.e. x -> x + att(x)
-    stochastic_depth_rate : float
-        Proability to skip this layer.
-        During training, the layer may skip residual computation and return input
-        as-is with given probability.
+    
+    Args:
+        size (int): Input dimension.
+        self_attn (nn.Layer): Self-attention module instance.
+            `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
+            can be used as the argument.
+        feed_forward (nn.Layer): Feed-forward module instance.
+            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
+            can be used as the argument.
+        feed_forward_macaron (nn.Layer): Additional feed-forward module instance.
+            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
+            can be used as the argument.
+        conv_module (nn.Layer): Convolution module instance.
+            `ConvolutionModule` instance can be used as the argument.
+        dropout_rate (float): Dropout rate.
+        normalize_before (bool): Whether to use layer_norm before the first block.
+        concat_after (bool): Whether to concat attention layer's input and output.
+            if True, additional linear will be applied.
+            i.e. x -> x + linear(concat(x, att(x)))
+            if False, no additional linear will be applied. i.e. x -> x + att(x)
+        stochastic_depth_rate (float): Probability to skip this layer.
+            During training, the layer may skip residual computation and return input
+            as-is with given probability.
    """

    def __init__(
@@ -93,22 +84,17 @@ class EncoderLayer(nn.Layer):

    def forward(self, x_input, mask, cache=None):
        """Compute encoded features.
-        Parameters
-        ----------
-        x_input : Union[Tuple, paddle.Tensor]
-            Input tensor w/ or w/o pos emb.
-            - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
-            - w/o pos emb: Tensor (#batch, time, size).
-        mask : paddle.Tensor
-            Mask tensor for the input (#batch, time).
-        cache paddle.Tensor
-            Cache tensor of the input (#batch, time - 1, size).
-        Returns
-        ----------
-        paddle.Tensor
-            Output tensor (#batch, time, size).
-        paddle.Tensor
-            Mask tensor (#batch, time).
+
+        Args:
+            x_input(Union[Tuple, Tensor]): Input tensor w/ or w/o pos emb.
+                - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
+                - w/o pos emb: Tensor (#batch, time, size).
+            mask(Tensor): Mask tensor for the input (#batch, time).
+            cache (Tensor): Cache tensor of the input (#batch, time - 1, size).
+
+        Returns:
+            Tensor: Output tensor (#batch, time, size).
+            Tensor: Mask tensor (#batch, time).
        """
        if isinstance(x_input, tuple):
            x, pos_emb = x_input[0], x_input[1]

--- a/paddlespeech/t2s/modules/conv.py
+++ b/paddlespeech/t2s/modules/conv.py
@@ -40,36 +40,29 @@ class Conv1dCell(nn.Conv1D):
    2. padding must be a causal padding (recpetive_field - 1, 0).
    Thus, these arguments are removed from the ``__init__`` method of this
    class.
-    
-    Parameters
-    ----------
-    in_channels: int
-        The feature size of the input.
-    out_channels: int
-        The feature size of the output.
-    kernel_size: int or Tuple[int]
-        The size of the kernel.
-    dilation: int or Tuple[int]
-        The dilation of the convolution, by default 1
-    weight_attr: ParamAttr, Initializer, str or bool, optional
-        The parameter attribute of the convolution kernel, by default None.
-    bias_attr: ParamAttr, Initializer, str or bool, optional
-        The parameter attribute of the bias. If ``False``, this layer does not
-        have a bias, by default None.
-        
-    Examples
-    --------
-    >>> cell = Conv1dCell(3, 4, kernel_size=5)
-    >>> inputs = [paddle.randn([4, 3]) for _ in range(16)]
-    >>> outputs = []
-    >>> cell.eval()
-    >>> cell.start_sequence()
-    >>> for xt in inputs:
-    >>>     outputs.append(cell.add_input(xt))
-    >>> len(outputs))
-    16
-    >>> outputs[0].shape
-    [4, 4]
+
+    Args:
+        in_channels (int): The feature size of the input.
+        out_channels (int): The feature size of the output.
+        kernel_size (int or Tuple[int]): The size of the kernel.
+        dilation (int or Tuple[int]): The dilation of the convolution, by default 1
+        weight_attr (ParamAttr, Initializer, str or bool, optional): The parameter attribute of the convolution kernel,
+            by default None.
+        bias_attr (ParamAttr, Initializer, str or bool, optional): The parameter attribute of the bias.
+            If ``False``, this layer does not have a bias, by default None.
+            
+    Examples: 
+        >>> cell = Conv1dCell(3, 4, kernel_size=5)
+        >>> inputs = [paddle.randn([4, 3]) for _ in range(16)]
+        >>> outputs = []
+        >>> cell.eval()
+        >>> cell.start_sequence()
+        >>> for xt in inputs:
+        ...     outputs.append(cell.add_input(xt))
+        >>> len(outputs)
+        16
+        >>> outputs[0].shape
+        [4, 4]
    """

    def __init__(self,
@@ -103,15 +96,13 @@ class Conv1dCell(nn.Conv1D):
    def start_sequence(self):
        """Prepare the layer for a series of incremental forward.
        
-        Warnings
-        ---------
-        This method should be called before a sequence of calls to
-        ``add_input``.
+        Warnings:
+            This method should be called before a sequence of calls to
+            ``add_input``.

-        Raises
-        ------
-        Exception
-            If this method is called when the layer is in training mode.
+        Raises:
+            Exception:
+                If this method is called when the layer is in training mode.
        """
        if self.training:
            raise Exception("only use start_sequence in evaluation")
@@ -130,10 +121,9 @@ class Conv1dCell(nn.Conv1D):
    def initialize_buffer(self, x_t):
        """Initialize the buffer for the step input.

-        Parameters
-        ----------
-        x_t : Tensor [shape=(batch_size, in_channels)]
-            The step input.
+        Args:
+            x_t (Tensor): The step input. shape=(batch_size, in_channels)
+            
        """
        batch_size, _ = x_t.shape
        self._buffer = paddle.zeros(
@@ -143,26 +133,22 @@ class Conv1dCell(nn.Conv1D):
    def update_buffer(self, x_t):
        """Shift the buffer by one step.

-        Parameters
-        ----------
-        x_t : Tensor [shape=(batch_size, in_channels)]
-            The step input.
+        Args:
+            x_t (Tensor): The step input. shape=(batch_size, in_channels)
+            
        """
        self._buffer = paddle.concat(
            [self._buffer[:, :, 1:], paddle.unsqueeze(x_t, -1)], -1)

    def add_input(self, x_t):
        """Add step input and compute step output.
-        
-        Parameters
-        -----------
-        x_t : Tensor [shape=(batch_size, in_channels)]
-            The step input.
-            
-        Returns
-        -------
-        y_t :Tensor [shape=(batch_size, out_channels)]
-            The step output.
+
+        Args:
+            x_t (Tensor): The step input. shape=(batch_size, in_channels)
+          
+        Returns: 
+            y_t (Tensor): The step output. shape=(batch_size, out_channels)
+
        """
        batch_size = x_t.shape[0]
        if self.receptive_field > 1:
@@ -186,33 +172,26 @@ class Conv1dCell(nn.Conv1D):
 class Conv1dBatchNorm(nn.Layer):
    """A Conv1D Layer followed by a BatchNorm1D.

-    Parameters
-    ----------
-    in_channels : int
-        The feature size of the input.
-    out_channels : int
-        The feature size of the output.
-    kernel_size : int
-        The size of the convolution kernel.
-    stride : int, optional
-        The stride of the convolution, by default 1.
-    padding : int, str or Tuple[int], optional
-        The padding of the convolution.
-        If int, a symmetrical padding is applied before convolution;
-        If str, it should be "same" or "valid";
-        If Tuple[int], its length should be 2, meaning
-        ``(pad_before, pad_after)``, by default 0.
-    weight_attr : ParamAttr, Initializer, str or bool, optional
-        The parameter attribute of the convolution kernel, by default None.
-    bias_attr : ParamAttr, Initializer, str or bool, optional
-        The parameter attribute of the bias of the convolution, by default
-        None.
-    data_format : str ["NCL" or "NLC"], optional
-        The data layout of the input, by default "NCL"
-    momentum : float, optional
-        The momentum of the BatchNorm1D layer, by default 0.9
-    epsilon : [type], optional
-        The epsilon of the BatchNorm1D layer, by default 1e-05
+    Args:
+        in_channels (int): The feature size of the input.
+        out_channels (int): The feature size of the output.
+        kernel_size (int): The size of the convolution kernel.
+        stride (int, optional): The stride of the convolution, by default 1.
+        padding (int, str or Tuple[int], optional):
+            The padding of the convolution.
+            If int, a symmetrical padding is applied before convolution;
+            If str, it should be "same" or "valid";
+            If Tuple[int], its length should be 2, meaning
+            ``(pad_before, pad_after)``, by default 0.
+        weight_attr (ParamAttr, Initializer, str or bool, optional):
+            The parameter attribute of the convolution kernel,
+            by default None.
+        bias_attr (ParamAttr, Initializer, str or bool, optional):
+            The parameter attribute of the bias of the convolution,
+            by default None.
+        data_format (str ["NCL" or "NLC"], optional): The data layout of the input, by default "NCL"
+        momentum (float, optional): The momentum of the BatchNorm1D layer, by default 0.9
+        epsilon (float, optional): The epsilon of the BatchNorm1D layer, by default 1e-05
    """

    def __init__(self,
@@ -244,16 +223,15 @@ class Conv1dBatchNorm(nn.Layer):

    def forward(self, x):
        """Forward pass of the Conv1dBatchNorm layer.
-
-        Parameters
-        ----------
-        x : Tensor [shape=(B, C_in, T_in) or (B, T_in, C_in)]
-            The input tensor. Its data layout depends on ``data_format``.
-
-        Returns
-        -------
-        Tensor [shape=(B, C_out, T_out) or (B, T_out, C_out)]
-            The output tensor. 
+        
+        Args:
+            x (Tensor): The input tensor. Its data layout depends on ``data_format``. 
+                shape=(B, C_in, T_in) or (B, T_in, C_in)
+    
+        Returns:
+            Tensor: The output tensor. 
+                shape=(B, C_out, T_out) or (B, T_out, C_out)
+                
        """
        x = self.conv(x)
        x = self.bn(x)

--- a/paddlespeech/t2s/modules/geometry.py
+++ b/paddlespeech/t2s/modules/geometry.py
@@ -17,24 +17,18 @@ import paddle

 def shuffle_dim(x, axis, perm=None):
    """Permute input tensor along aixs given the permutation or randomly.
+    
+    Args:
+        x (Tensor): The input tensor.
+        axis (int): The axis to shuffle.
+        perm (List[int], ndarray, optional): 
+            The order to reorder the tensor along the ``axis``-th dimension.
+            It is a permutation of ``[0, d)``, where d is the size of the
+            ``axis``-th dimension of the input tensor. If not provided,
+            a random permutation is used. Defaults to None.

-    Parameters
-    ----------
-    x : Tensor
-        The input tensor.
-    axis : int
-        The axis to shuffle.
-    perm : List[int], ndarray, optional
-        The order to reorder the tensor along the ``axis``-th dimension.
-        
-        It is a permutation of ``[0, d)``, where d is the size of the
-        ``axis``-th dimension of the input tensor. If not provided,
-        a random permutation is used. Defaults to None.
-
-    Returns
-    ---------
-    Tensor
-        The shuffled tensor, which has the same shape as x does.
+    Returns:
+        Tensor: The shuffled tensor, which has the same shape as x does.
    """
    size = x.shape[axis]
    if perm is not None and len(perm) != size:

--- a/paddlespeech/t2s/modules/layer_norm.py
+++ b/paddlespeech/t2s/modules/layer_norm.py
@@ -18,13 +18,9 @@ from paddle import nn

 class LayerNorm(nn.LayerNorm):
    """Layer normalization module.
-
-    Parameters
-    ----------
-    nout : int
-        Output dim size.
-    dim : int
-        Dimension to be normalized.
+    Args:
+        nout (int): Output dim size.
+        dim (int): Dimension to be normalized.
    """

    def __init__(self, nout, dim=-1):
@@ -35,15 +31,11 @@ class LayerNorm(nn.LayerNorm):
    def forward(self, x):
        """Apply layer normalization.

-        Parameters
-        ----------
-        x : paddle.Tensor
-            Input tensor.
+        Args:
+            x (Tensor): Input tensor.

-        Returns
-        ----------
-        paddle.Tensor
-            Normalized tensor.
+        Returns: 
+            Tensor: Normalized tensor.
        """

        if self.dim == -1:

--- a/paddlespeech/t2s/modules/losses.py
+++ b/paddlespeech/t2s/modules/losses.py
--- a/paddlespeech/t2s/modules/nets_utils.py
+++ b/paddlespeech/t2s/modules/nets_utils.py
@@ -20,27 +20,21 @@ from typeguard import check_argument_types
 def pad_list(xs, pad_value):
    """Perform padding for the list of tensors.

-    Parameters
-    ----------
-    xs : List[Tensor]
-        List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
-    pad_value : float)
-        Value for padding.
-
-    Returns
-    ----------
-    Tensor
-        Padded tensor (B, Tmax, `*`).
-
-    Examples
-    ----------
-    >>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])]
-    >>> x
-    [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
-    >>> pad_list(x, 0)
-    tensor([[1., 1., 1., 1.],
-            [1., 1., 0., 0.],
-            [1., 0., 0., 0.]])
+    Args:
+        xs (List[Tensor]): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
+        pad_value (float): Value for padding.
+
+    Returns:
+        Tensor: Padded tensor (B, Tmax, `*`).
+
+    Examples:
+        >>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])]
+        >>> x
+        [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
+        >>> pad_list(x, 0)
+        tensor([[1., 1., 1., 1.],
+                [1., 1., 0., 0.],
+                [1., 0., 0., 0.]])
    """
    n_batch = len(xs)
    max_len = max(x.shape[0] for x in xs)
@@ -55,25 +49,20 @@ def pad_list(xs, pad_value):
 def make_pad_mask(lengths, length_dim=-1):
    """Make mask tensor containing indices of padded part.

-    Parameters
-    ----------
-    lengths : LongTensor
-            Batch of lengths (B,).
-
-    Returns
-    ----------
-    Tensor(bool)
-        Mask tensor containing indices of padded part bool.
-
-    Examples
-    ----------
-    With only lengths.
-
-    >>> lengths = [5, 3, 2]
-    >>> make_non_pad_mask(lengths)
-    masks = [[0, 0, 0, 0 ,0],
-                [0, 0, 0, 1, 1],
-                [0, 0, 1, 1, 1]]
+    Args:
+        lengths (Tensor(int64)): Batch of lengths (B,).
+
+    Returns: 
+        Tensor(bool): Mask tensor containing indices of padded part bool.
+
+    Examples:
+        With only lengths.
+
+        >>> lengths = [5, 3, 2]
+        >>> make_pad_mask(lengths)
+        masks = [[0, 0, 0, 0 ,0],
+                    [0, 0, 0, 1, 1],
+                    [0, 0, 1, 1, 1]]
    """
    if length_dim == 0:
        raise ValueError("length_dim cannot be 0: {}".format(length_dim))
@@ -91,31 +80,24 @@ def make_pad_mask(lengths, length_dim=-1):
 def make_non_pad_mask(lengths, length_dim=-1):
    """Make mask tensor containing indices of non-padded part.

-    Parameters
-    ----------
-    lengths : LongTensor or List
-            Batch of lengths (B,).
-    xs : Tensor, optional
-        The reference tensor.
-        If set, masks will be the same shape as this tensor.
-    length_dim : int, optional
-        Dimension indicator of the above tensor.
-        See the example.
-
-    Returns
-    ----------
-    Tensor(bool)
-        mask tensor containing indices of padded part bool.
-
-    Examples
-    ----------
-    With only lengths.
-
-    >>> lengths = [5, 3, 2]
-    >>> make_non_pad_mask(lengths)
-    masks = [[1, 1, 1, 1 ,1],
-                [1, 1, 1, 0, 0],
-                [1, 1, 0, 0, 0]]
+    Args:
+        lengths (Tensor(int64) or List): Batch of lengths (B,).
+        xs (Tensor, optional): The reference tensor.
+            If set, masks will be the same shape as this tensor.
+        length_dim (int, optional): Dimension indicator of the above tensor.
+            See the example.
+
+    Returns:
+        Tensor(bool): Mask tensor containing indices of non-padded part bool.
+
+    Examples: 
+        With only lengths.
+
+        >>> lengths = [5, 3, 2]
+        >>> make_non_pad_mask(lengths)
+        masks = [[1, 1, 1, 1 ,1],
+                    [1, 1, 1, 0, 0],
+                    [1, 1, 0, 0, 0]]
    """
    return paddle.logical_not(make_pad_mask(lengths, length_dim))

@@ -127,12 +109,9 @@ def initialize(model: nn.Layer, init: str):

    Custom initialization routines can be implemented into submodules

-    Parameters
-    ----------
-    model : nn.Layer
-        Target.
-    init : str
-        Method of initialization.
+    Args:
+        model (nn.Layer): Target.
+        init (str): Method of initialization.
    """
    assert check_argument_types()


--- a/paddlespeech/t2s/modules/pqmf.py
+++ b/paddlespeech/t2s/modules/pqmf.py
@@ -24,20 +24,16 @@ def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0):
    """Design prototype filter for PQMF.
    This method is based on `A Kaiser window approach for the design of prototype
    filters of cosine modulated filterbanks`_.
-    Parameters
-    ----------
-    taps : int
-        The number of filter taps.
-    cutoff_ratio : float
-        Cut-off frequency ratio.
-    beta : float
-        Beta coefficient for kaiser window.
-    Returns
-    ----------
-    ndarray
-        Impluse response of prototype filter (taps + 1,).
-    .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
-        https://ieeexplore.ieee.org/abstract/document/681427
+
+    Args:
+        taps (int): The number of filter taps.
+        cutoff_ratio (float): Cut-off frequency ratio.
+        beta (float): Beta coefficient for kaiser window.
+    Returns:
+        ndarray:
+            Impulse response of prototype filter (taps + 1,).
+        .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
+            https://ieeexplore.ieee.org/abstract/document/681427
    """
    # check the arguments are valid
    assert taps % 2 == 0, "The number of taps mush be even number."
@@ -68,16 +64,12 @@ class PQMF(nn.Layer):
        """Initilize PQMF module.
        The cutoff_ratio and beta parameters are optimized for #subbands = 4.
        See dicussion in https://github.com/kan-bayashi/ParallelWaveGAN/issues/195.
-        Parameters
-        ----------
-        subbands : int
-            The number of subbands.
-        taps : int
-            The number of filter taps.
-        cutoff_ratio : float
-            Cut-off frequency ratio.
-        beta : float
-            Beta coefficient for kaiser window.
+
+        Args:
+            subbands (int): The number of subbands.
+            taps (int): The number of filter taps.
+            cutoff_ratio (float): Cut-off frequency ratio.
+            beta (float): Beta coefficient for kaiser window.
        """
        super().__init__()

@@ -110,28 +102,20 @@ class PQMF(nn.Layer):

    def analysis(self, x):
        """Analysis with PQMF.
-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, 1, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, subbands, T // subbands).
+        Args:
+            x (Tensor): Input tensor (B, 1, T).
+        Returns:
+            Tensor: Output tensor (B, subbands, T // subbands).
        """
        x = F.conv1d(self.pad_fn(x), self.analysis_filter)
        return F.conv1d(x, self.updown_filter, stride=self.subbands)

    def synthesis(self, x):
        """Synthesis with PQMF.
-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, subbands, T // subbands).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, 1, T).
+        Args:
+            x (Tensor): Input tensor (B, subbands, T // subbands).
+        Returns:
+            Tensor: Output tensor (B, 1, T).
        """
        x = F.conv1d_transpose(
            x, self.updown_filter * self.subbands, stride=self.subbands)

--- a/paddlespeech/t2s/modules/predictor/duration_predictor.py
+++ b/paddlespeech/t2s/modules/predictor/duration_predictor.py
@@ -49,20 +49,13 @@ class DurationPredictor(nn.Layer):
                 offset=1.0):
        """Initilize duration predictor module.

-        Parameters
-        ----------
-        idim : int
-            Input dimension.
-        n_layers : int, optional
-                Number of convolutional layers.
-        n_chans : int, optional
-            Number of channels of convolutional layers.
-        kernel_size : int, optional
-            Kernel size of convolutional layers.
-        dropout_rate : float, optional
-                Dropout rate.
-        offset : float, optional
-            Offset value to avoid nan in log domain.
+        Args:
+            idim (int): Input dimension.
+            n_layers (int, optional): Number of convolutional layers.
+            n_chans (int, optional): Number of channels of convolutional layers.
+            kernel_size (int, optional): Kernel size of convolutional layers.
+            dropout_rate (float, optional): Dropout rate.
+            offset (float, optional): Offset value to avoid nan in log domain.

        """
        super().__init__()
@@ -105,35 +98,23 @@ class DurationPredictor(nn.Layer):

    def forward(self, xs, x_masks=None):
        """Calculate forward propagation.
+        Args:
+            xs(Tensor): Batch of input sequences (B, Tmax, idim).
+            x_masks(ByteTensor, optional): Batch of masks indicating padded part (B, Tmax). (Default value = None)

-        Parameters
-        ----------
-        xs : Tensor
-            Batch of input sequences (B, Tmax, idim).
-        x_masks : ByteTensor, optional
-            Batch of masks indicating padded part (B, Tmax).
-
-        Returns
-        ----------
-        Tensor
-            Batch of predicted durations in log domain (B, Tmax).
+        Returns:
+            Tensor: Batch of predicted durations in log domain (B, Tmax).
        """
        return self._forward(xs, x_masks, False)

    def inference(self, xs, x_masks=None):
        """Inference duration.
+        Args:
+            xs(Tensor): Batch of input sequences (B, Tmax, idim).
+            x_masks(Tensor(bool), optional): Batch of masks indicating padded part (B, Tmax). (Default value = None)

-        Parameters
-        ----------
-        xs : Tensor
-            Batch of input sequences (B, Tmax, idim).
-        x_masks : Tensor(bool), optional
-            Batch of masks indicating padded part (B, Tmax).
-
-        Returns
-        ----------
-        Tensor
-            Batch of predicted durations in linear domain int64 (B, Tmax).
+        Returns:
+            Tensor: Batch of predicted durations in linear domain int64 (B, Tmax).
        """
        return self._forward(xs, x_masks, True)

@@ -147,13 +128,9 @@ class DurationPredictorLoss(nn.Layer):

    def __init__(self, offset=1.0, reduction="mean"):
        """Initilize duration predictor loss module.
-
-        Parameters
-        ----------
-        offset : float, optional
-            Offset value to avoid nan in log domain.
-        reduction : str
-            Reduction type in loss calculation.
+        Args:
+            offset (float, optional): Offset value to avoid nan in log domain.
+            reduction (str): Reduction type in loss calculation.
        """
        super().__init__()
        self.criterion = nn.MSELoss(reduction=reduction)
@@ -162,21 +139,15 @@ class DurationPredictorLoss(nn.Layer):
    def forward(self, outputs, targets):
        """Calculate forward propagation.

-        Parameters
-        ----------
-        outputs : Tensor
-            Batch of prediction durations in log domain (B, T)
-        targets : Tensor
-            Batch of groundtruth durations in linear domain (B, T)
-
-        Returns
-        ----------
-        Tensor
-            Mean squared error loss value.
-
-        Note
-        ----------
-        `outputs` is in log domain but `targets` is in linear domain.
+        Args:
+            outputs(Tensor): Batch of prediction durations in log domain (B, T)
+            targets(Tensor): Batch of groundtruth durations in linear domain (B, T)
+
+        Returns: 
+            Tensor: Mean squared error loss value.
+
+        Note: 
+            `outputs` is in log domain but `targets` is in linear domain.
        """
        # NOTE: outputs is in log domain while targets in linear
        targets = paddle.log(targets.cast(dtype='float32') + self.offset)

--- a/paddlespeech/t2s/modules/predictor/length_regulator.py
+++ b/paddlespeech/t2s/modules/predictor/length_regulator.py
@@ -35,10 +35,8 @@ class LengthRegulator(nn.Layer):
    def __init__(self, pad_value=0.0):
        """Initilize length regulator module.

-        Parameters
-        ----------
-        pad_value : float, optional
-            Value used for padding.
+        Args:
+            pad_value (float, optional): Value used for padding.

        """
        super().__init__()
@@ -90,19 +88,13 @@ class LengthRegulator(nn.Layer):
    def forward(self, xs, ds, alpha=1.0, is_inference=False):
        """Calculate forward propagation.

-        Parameters
-        ----------
-        xs : Tensor
-            Batch of sequences of char or phoneme embeddings (B, Tmax, D).
-        ds : Tensor(int64)
-            Batch of durations of each frame (B, T).
-        alpha : float, optional
-            Alpha value to control speed of speech.
+        Args:
+            xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D).
+            ds (Tensor(int64)): Batch of durations of each frame (B, T).
+            alpha (float, optional): Alpha value to control speed of speech.

-        Returns
-        ----------
-        Tensor
-            replicated input tensor based on durations (B, T*, D).
+        Returns:
+            Tensor: replicated input tensor based on durations (B, T*, D).
        """

        if alpha != 1.0:

--- a/paddlespeech/t2s/modules/predictor/variance_predictor.py
+++ b/paddlespeech/t2s/modules/predictor/variance_predictor.py
@@ -42,18 +42,12 @@ class VariancePredictor(nn.Layer):
            dropout_rate: float=0.5, ):
        """Initilize duration predictor module.

-        Parameters
-        ----------
-        idim : int
-            Input dimension.
-        n_layers : int, optional
-            Number of convolutional layers.
-        n_chans : int, optional
-            Number of channels of convolutional layers.
-        kernel_size : int, optional
-            Kernel size of convolutional layers.
-        dropout_rate : float, optional
-            Dropout rate.
+        Args:
+            idim (int): Input dimension.
+            n_layers (int, optional): Number of convolutional layers.
+            n_chans (int, optional): Number of channels of convolutional layers.
+            kernel_size (int, optional): Kernel size of convolutional layers.
+            dropout_rate (float, optional): Dropout rate.
        """
        assert check_argument_types()
        super().__init__()
@@ -79,17 +73,12 @@ class VariancePredictor(nn.Layer):
                x_masks: paddle.Tensor=None) -> paddle.Tensor:
        """Calculate forward propagation.

-        Parameters
-        ----------
-            xs : Tensor
-                Batch of input sequences (B, Tmax, idim).
-            x_masks : Tensor(bool), optional
-                Batch of masks indicating padded part (B, Tmax, 1).
+        Args:
+            xs (Tensor): Batch of input sequences (B, Tmax, idim).
+            x_masks (Tensor(bool), optional): Batch of masks indicating padded part (B, Tmax, 1).

-        Returns
-        ----------
-            Tensor
-                Batch of predicted sequences (B, Tmax, 1).
+        Returns:
+            Tensor: Batch of predicted sequences (B, Tmax, 1).
        """
        # (B, idim, Tmax)
        xs = xs.transpose([0, 2, 1])

--- a/paddlespeech/t2s/modules/residual_block.py
+++ b/paddlespeech/t2s/modules/residual_block.py
@@ -28,26 +28,16 @@ class WaveNetResidualBlock(nn.Layer):
    unit and parametric redidual and skip connections. For more details, 
    refer to `WaveNet: A Generative Model for Raw Audio <https://arxiv.org/abs/1609.03499>`_.

-    Parameters
-    ----------
-    kernel_size : int, optional
-        Kernel size of the 1D convolution, by default 3
-    residual_channels : int, optional
-        Feature size of the resiaudl output(and also the input), by default 64
-    gate_channels : int, optional
-        Output feature size of the 1D convolution, by default 128
-    skip_channels : int, optional
-        Feature size of the skip output, by default 64
-    aux_channels : int, optional
-        Feature size of the auxiliary input (e.g. spectrogram), by default 80
-    dropout : float, optional
-        Probability of the dropout before the 1D convolution, by default 0.
-    dilation : int, optional
-        Dilation of the 1D convolution, by default 1
-    bias : bool, optional
-        Whether to use bias in the 1D convolution, by default True
-    use_causal_conv : bool, optional
-        Whether to use causal padding for the 1D convolution, by default False
+    Args:
+        kernel_size (int, optional): Kernel size of the 1D convolution, by default 3
+        residual_channels (int, optional): Feature size of the residual output (and also the input), by default 64
+        gate_channels (int, optional): Output feature size of the 1D convolution, by default 128
+        skip_channels (int, optional): Feature size of the skip output, by default 64
+        aux_channels (int, optional): Feature size of the auxiliary input (e.g. spectrogram), by default 80
+        dropout (float, optional): Probability of the dropout before the 1D convolution, by default 0.
+        dilation (int, optional): Dilation of the 1D convolution, by default 1
+        bias (bool, optional): Whether to use bias in the 1D convolution, by default True
+        use_causal_conv (bool, optional): Whether to use causal padding for the 1D convolution, by default False
    """

    def __init__(self,
@@ -90,21 +80,15 @@ class WaveNetResidualBlock(nn.Layer):

    def forward(self, x, c):
        """
-        Parameters
-        ----------
-        x : Tensor
-            Shape (N, C_res, T), the input features.
-        c : Tensor
-            Shape (N, C_aux, T), the auxiliary input.
-
-        Returns
-        -------
-        res : Tensor
-            Shape (N, C_res, T), the residual output, which is used as the 
-            input of the next ResidualBlock in a stack of ResidualBlocks.
-        skip : Tensor
-            Shape (N, C_skip, T), the skip output, which is collected among
-            each layer in a stack of ResidualBlocks.
+        Args:
+            x (Tensor): the input features. Shape (N, C_res, T)
+            c (Tensor): the auxiliary input. Shape (N, C_aux, T)
+
+        Returns:
+            res (Tensor): Shape (N, C_res, T), the residual output, which is used as the 
+                input of the next ResidualBlock in a stack of ResidualBlocks.
+            skip (Tensor): Shape (N, C_skip, T), the skip output, which is collected among
+                each layer in a stack of ResidualBlocks.
        """
        x_input = x
        x = F.dropout(x, self.dropout, training=self.training)
@@ -136,22 +120,14 @@ class HiFiGANResidualBlock(nn.Layer):
            nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.1},
    ):
        """Initialize HiFiGANResidualBlock module.
-        Parameters
-        ----------
-        kernel_size : int
-            Kernel size of dilation convolution layer.
-        channels : int
-            Number of channels for convolution layer.
-        dilations : List[int]
-            List of dilation factors.
-        use_additional_convs : bool
-            Whether to use additional convolution layers.
-        bias : bool
-            Whether to add bias parameter in convolution layers.
-        nonlinear_activation : str
-            Activation function module name.
-        nonlinear_activation_params : dict
-            Hyperparameters for activation function.
+        Args:
+            kernel_size (int): Kernel size of dilation convolution layer.
+            channels (int): Number of channels for convolution layer.
+            dilations (List[int]): List of dilation factors.
+            use_additional_convs (bool): Whether to use additional convolution layers.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            nonlinear_activation (str): Activation function module name.
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
        """
        super().__init__()

@@ -190,14 +166,10 @@ class HiFiGANResidualBlock(nn.Layer):

    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, channels, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, channels, T).
+        Args:
+            x (Tensor): Input tensor (B, channels, T).
+        Returns:
+            Tensor: Output tensor (B, channels, T).
        """
        for idx in range(len(self.convs1)):
            xt = self.convs1[idx](x)

--- a/paddlespeech/t2s/modules/residual_stack.py
+++ b/paddlespeech/t2s/modules/residual_stack.py
--- a/paddlespeech/t2s/modules/style_encoder.py
+++ b/paddlespeech/t2s/modules/style_encoder.py
--- a/paddlespeech/t2s/modules/tacotron2/attentions.py
+++ b/paddlespeech/t2s/modules/tacotron2/attentions.py
--- a/paddlespeech/t2s/modules/tacotron2/decoder.py
+++ b/paddlespeech/t2s/modules/tacotron2/decoder.py
--- a/paddlespeech/t2s/modules/tacotron2/encoder.py
+++ b/paddlespeech/t2s/modules/tacotron2/encoder.py
--- a/paddlespeech/t2s/modules/tade_res_block.py
+++ b/paddlespeech/t2s/modules/tade_res_block.py
--- a/paddlespeech/t2s/modules/transformer/attention.py
+++ b/paddlespeech/t2s/modules/transformer/attention.py
--- a/paddlespeech/t2s/modules/transformer/decoder.py
+++ b/paddlespeech/t2s/modules/transformer/decoder.py
--- a/paddlespeech/t2s/modules/transformer/decoder_layer.py
+++ b/paddlespeech/t2s/modules/transformer/decoder_layer.py
--- a/paddlespeech/t2s/modules/transformer/embedding.py
+++ b/paddlespeech/t2s/modules/transformer/embedding.py
--- a/paddlespeech/t2s/modules/transformer/encoder.py
+++ b/paddlespeech/t2s/modules/transformer/encoder.py
--- a/paddlespeech/t2s/modules/transformer/encoder_layer.py
+++ b/paddlespeech/t2s/modules/transformer/encoder_layer.py
--- a/paddlespeech/t2s/modules/transformer/lightconv.py
+++ b/paddlespeech/t2s/modules/transformer/lightconv.py
--- a/paddlespeech/t2s/modules/transformer/mask.py
+++ b/paddlespeech/t2s/modules/transformer/mask.py
--- a/paddlespeech/t2s/modules/transformer/multi_layer_conv.py
+++ b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py
--- a/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py
+++ b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py
--- a/paddlespeech/t2s/modules/transformer/repeat.py
+++ b/paddlespeech/t2s/modules/transformer/repeat.py
--- a/paddlespeech/t2s/modules/transformer/subsampling.py
+++ b/paddlespeech/t2s/modules/transformer/subsampling.py
--- a/paddlespeech/t2s/modules/upsample.py
+++ b/paddlespeech/t2s/modules/upsample.py
--- a/paddlespeech/t2s/training/experiment.py
+++ b/paddlespeech/t2s/training/experiment.py
--- a/paddlespeech/t2s/training/extensions/snapshot.py
+++ b/paddlespeech/t2s/training/extensions/snapshot.py
--- a/paddlespeech/t2s/utils/error_rate.py
+++ b/paddlespeech/t2s/utils/error_rate.py
--- a/paddlespeech/t2s/utils/h5_utils.py
+++ b/paddlespeech/t2s/utils/h5_utils.py