PaddlePaddle / DeepSpeech
Commit 9699c007
Authored on Feb 11, 2022 by 小湉湉
change the docstring style from numpydoc to google, test=tts
Parent: 683679be

Showing 57 changed files with 2350 additions and 4150 deletions (+2350, -4150)
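For readers unfamiliar with the two conventions, here is a minimal before/after sketch of the conversion this commit performs. The function below is hypothetical (not from the commit); only the docstring layouts follow the two styles.

```python
def lookup_numpydoc(table, key):
    """Look up a key in a table (numpydoc style, the old convention).

    Parameters
    ----------
    table : dict
        Mapping to search.
    key : str
        Key to look up.

    Returns
    -------
    object
        The stored value.
    """
    return table[key]


def lookup_google(table, key):
    """Look up a key in a table (Google style, the new convention).

    Args:
        table (dict): Mapping to search.
        key (str): Key to look up.

    Returns:
        object: The stored value.
    """
    return table[key]
```

The Google style carries the same information in fewer lines, which is why this commit removes almost twice as many lines as it adds.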
paddlespeech/t2s/datasets/data_table.py  +19 -37
paddlespeech/t2s/datasets/preprocess_utils.py  +17 -34
paddlespeech/t2s/datasets/vocoder_batch_fn.py  +24 -40
paddlespeech/t2s/exps/transformer_tts/preprocess.py  +11 -17
paddlespeech/t2s/frontend/arpabet.py  +35 -69
paddlespeech/t2s/frontend/phonectic.py  +49 -96
paddlespeech/t2s/frontend/vocab.py  +6 -16
paddlespeech/t2s/frontend/zh_normalization/chronology.py  +12 -18
paddlespeech/t2s/frontend/zh_normalization/num.py  +28 -42
paddlespeech/t2s/frontend/zh_normalization/phonecode.py  +8 -12
paddlespeech/t2s/frontend/zh_normalization/quantifier.py  +4 -6
paddlespeech/t2s/frontend/zh_normalization/text_normlization.py  +4 -8
paddlespeech/t2s/models/fastspeech2/fastspeech2.py  +165 -308
paddlespeech/t2s/models/hifigan/hifigan.py  +111 -184
paddlespeech/t2s/models/melgan/melgan.py  +72 -127
paddlespeech/t2s/models/melgan/style_melgan.py  +39 -70
paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py  +82 -145
paddlespeech/t2s/models/tacotron2/tacotron2.py  +73 -134
paddlespeech/t2s/models/transformer_tts/transformer_tts.py  +118 -215
paddlespeech/t2s/models/waveflow.py  +155 -328
paddlespeech/t2s/models/wavernn/wavernn.py  +95 -145
paddlespeech/t2s/modules/causal_conv.py  +8 -16
paddlespeech/t2s/modules/conformer/convolution.py  +9 -14
paddlespeech/t2s/modules/conformer/encoder_layer.py  +34 -48
paddlespeech/t2s/modules/conv.py  +71 -93
paddlespeech/t2s/modules/geometry.py  +11 -17
paddlespeech/t2s/modules/layer_norm.py  +7 -15
paddlespeech/t2s/modules/losses.py  +165 -269
paddlespeech/t2s/modules/nets_utils.py  +50 -71
paddlespeech/t2s/modules/pqmf.py  +24 -40
paddlespeech/t2s/modules/predictor/duration_predictor.py  +29 -58
paddlespeech/t2s/modules/predictor/length_regulator.py  +8 -16
paddlespeech/t2s/modules/predictor/variance_predictor.py  +11 -22
paddlespeech/t2s/modules/residual_block.py  +31 -59
paddlespeech/t2s/modules/residual_stack.py  +16 -28
paddlespeech/t2s/modules/style_encoder.py  +42 -82
paddlespeech/t2s/modules/tacotron2/attentions.py  +76 -137
paddlespeech/t2s/modules/tacotron2/decoder.py  +104 -167
paddlespeech/t2s/modules/tacotron2/encoder.py  +26 -49
paddlespeech/t2s/modules/tade_res_block.py  +13 -24
paddlespeech/t2s/modules/transformer/attention.py  +49 -92
paddlespeech/t2s/modules/transformer/decoder.py  +54 -96
paddlespeech/t2s/modules/transformer/decoder_layer.py  +31 -46
paddlespeech/t2s/modules/transformer/embedding.py  +28 -55
paddlespeech/t2s/modules/transformer/encoder.py  +112 -204
paddlespeech/t2s/modules/transformer/encoder_layer.py  +19 -33
paddlespeech/t2s/modules/transformer/lightconv.py  +15 -29
paddlespeech/t2s/modules/transformer/mask.py  +16 -25
paddlespeech/t2s/modules/transformer/multi_layer_conv.py  +18 -36
paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py  +4 -8
paddlespeech/t2s/modules/transformer/repeat.py  +5 -10
paddlespeech/t2s/modules/transformer/subsampling.py  +12 -24
paddlespeech/t2s/modules/upsample.py  +51 -90
paddlespeech/t2s/training/experiment.py  +24 -29
paddlespeech/t2s/training/extensions/snapshot.py  +2 -4
paddlespeech/t2s/utils/error_rate.py  +38 -71
paddlespeech/t2s/utils/h5_utils.py  +10 -22
paddlespeech/t2s/datasets/data_table.py
@@ -22,25 +22,16 @@ from paddle.io import Dataset
 class DataTable(Dataset):
     """Dataset to load and convert data for general purpose.
-    Parameters
-    ----------
-    data : List[Dict[str, Any]]
-        Metadata, a list of meta datum, each of which is composed of
-        several fields
-    fields : List[str], optional
-        Fields to use, if not specified, all the fields in the data are
-        used, by default None
-    converters : Dict[str, Callable], optional
-        Converters used to process each field, by default None
-    use_cache : bool, optional
-        Whether to use cache, by default False
-    Raises
-    ------
-    ValueError
+    Args:
+        data (List[Dict[str, Any]]): Metadata, a list of meta datum, each of which is composed of several fields
+        fields (List[str], optional): Fields to use, if not specified, all the fields in the data are used, by default None
+        converters (Dict[str, Callable], optional): Converters used to process each field, by default None
+        use_cache (bool, optional): Whether to use cache, by default False
+    Raises:
+        ValueError:
             If there is some field that does not exist in data.
-    ValueError
+        ValueError:
             If there is some field in converters that does not exist in fields.
     """
@@ -95,15 +86,11 @@ class DataTable(Dataset):
         """Convert a meta datum to an example by applying the corresponding
         converters to each fields requested.
-        Parameters
-        ----------
-        meta_datum : Dict[str, Any]
-            Meta datum
+        Args:
+            meta_datum (Dict[str, Any]): Meta datum
-        Returns
-        -------
-        Dict[str, Any]
-            Converted example
+        Returns:
+            Dict[str, Any]: Converted example
         """
         example = {}
         for field in self.fields:
@@ -118,16 +105,11 @@ class DataTable(Dataset):
     def __getitem__(self, idx: int) -> Dict[str, Any]:
         """Get an example given an index.
+        Args:
+            idx (int): Index of the example to get
-        Parameters
-        ----------
-        idx : int
-            Index of the example to get
-        Returns
-        -------
-        Dict[str, Any]
-            A converted example
+        Returns:
+            Dict[str, Any]: A converted example
         """
         if self.use_cache and self.caches[idx] is not None:
             return self.caches[idx]
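The hunks above only touch docstrings, but they also describe what DataTable does: select fields, apply per-field converters, and optionally cache converted examples. A minimal illustrative stand-in (simplified names and behavior, not the real PaddlePaddle implementation) can be sketched as:

```python
from typing import Any, Callable, Dict, List, Optional


class MiniDataTable:
    """Illustrative stand-in for DataTable (not the real implementation).

    Args:
        data (List[Dict[str, Any]]): Metadata, a list of meta datum.
        fields (List[str], optional): Fields to use; all fields by default.
        converters (Dict[str, Callable], optional): Per-field converters.
        use_cache (bool, optional): Whether to cache converted examples.
    """

    def __init__(self,
                 data: List[Dict[str, Any]],
                 fields: Optional[List[str]]=None,
                 converters: Optional[Dict[str, Callable]]=None,
                 use_cache: bool=False):
        self.data = data
        self.fields = fields or list(data[0].keys())
        self.converters = converters or {}
        self.use_cache = use_cache
        self.caches: List[Optional[Dict[str, Any]]] = [None] * len(data)

    def _convert(self, meta_datum: Dict[str, Any]) -> Dict[str, Any]:
        # Apply the converter for each requested field (identity if absent).
        return {f: self.converters.get(f, lambda x: x)(meta_datum[f])
                for f in self.fields}

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        if self.use_cache and self.caches[idx] is not None:
            return self.caches[idx]
        example = self._convert(self.data[idx])
        if self.use_cache:
            self.caches[idx] = example
        return example
```

For example, `MiniDataTable([{"a": "1"}], fields=["a"], converters={"a": int})[0]` yields `{"a": 1}`.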
...
paddlespeech/t2s/datasets/preprocess_utils.py
@@ -18,14 +18,10 @@ import re
 def get_phn_dur(file_name):
     '''
     read MFA duration.txt
-    Parameters
-    ----------
-    file_name : str or Path
-        path of gen_duration_from_textgrid.py's result
-    Returns
-    ----------
-    Dict
-        sentence: {'utt': ([char], [int])}
+    Args:
+        file_name (str or Path): path of gen_duration_from_textgrid.py's result
+    Returns:
+        Dict: sentence: {'utt': ([char], [int])}
     '''
     f = open(file_name, 'r')
     sentence = {}
@@ -48,10 +44,8 @@ def get_phn_dur(file_name):
 def merge_silence(sentence):
     '''
     merge silences
-    Parameters
-    ----------
-    sentence : Dict
-        sentence: {'utt': (([char], [int]), str)}
+    Args:
+        sentence (Dict): sentence: {'utt': (([char], [int]), str)}
     '''
     for utt in sentence:
         cur_phn, cur_dur, speaker = sentence[utt]
@@ -81,12 +75,9 @@ def merge_silence(sentence):
 def get_input_token(sentence, output_path, dataset="baker"):
     '''
     get phone set from training data and save it
-    Parameters
-    ----------
-    sentence : Dict
-        sentence: {'utt': ([char], [int])}
-    output_path : str or path
-        path to save phone_id_map
+    Args:
+        sentence (Dict): sentence: {'utt': ([char], [int])}
+        output_path (str or path): path to save phone_id_map
     '''
     phn_token = set()
     for utt in sentence:
@@ -112,14 +103,10 @@ def get_phones_tones(sentence,
                      dataset="baker"):
     '''
     get phone set and tone set from training data and save it
-    Parameters
-    ----------
-    sentence : Dict
-        sentence: {'utt': ([char], [int])}
-    phones_output_path : str or path
-        path to save phone_id_map
-    tones_output_path : str or path
-        path to save tone_id_map
+    Args:
+        sentence (Dict): sentence: {'utt': ([char], [int])}
+        phones_output_path (str or path): path to save phone_id_map
+        tones_output_path (str or path): path to save tone_id_map
     '''
     phn_token = set()
     tone_token = set()
@@ -162,14 +149,10 @@ def get_spk_id_map(speaker_set, output_path):
 def compare_duration_and_mel_length(sentences, utt, mel):
     '''
     check duration error, correct sentences[utt] if possible, else pop sentences[utt]
-    Parameters
-    ----------
-    sentences : Dict
-        sentences[utt] = [phones_list ,durations_list]
-    utt : str
-        utt_id
-    mel : np.ndarry
-        features (num_frames, n_mels)
+    Args:
+        sentences (Dict): sentences[utt] = [phones_list ,durations_list]
+        utt (str): utt_id
+        mel (np.ndarry): features (num_frames, n_mels)
     '''
     if utt in sentences:
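The get_phn_dur docstring promises a `{'utt': ([char], [int])}` mapping. A hypothetical sketch of that kind of parser follows; the real duration.txt layout comes from gen_duration_from_textgrid.py, so the `"utt|phones|durations"` line format below is an assumption for illustration only.

```python
from typing import Dict, List, Tuple


def parse_phn_dur(lines: List[str]) -> Dict[str, Tuple[List[str], List[int]]]:
    """Build {'utt': ([phone], [duration])} from duration lines.

    Args:
        lines (List[str]): lines in a hypothetical "utt|phones|durations"
            layout, e.g. "utt1|sp a1 sp|3 10 2"

    Returns:
        Dict: sentence: {'utt': ([str], [int])}
    """
    sentence = {}
    for line in lines:
        # Split utterance id, space-separated phones, space-separated durations.
        utt, phn, dur = line.strip().split('|')
        sentence[utt] = (phn.split(), [int(d) for d in dur.split()])
    return sentence
```

With the assumed format, `parse_phn_dur(["utt1|sp a1 sp|3 10 2"])` returns `{"utt1": (["sp", "a1", "sp"], [3, 10, 2])}`.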
paddlespeech/t2s/datasets/vocoder_batch_fn.py
@@ -29,15 +29,11 @@ class Clip(object):
             hop_size=256,
             aux_context_window=0, ):
         """Initialize customized collater for DataLoader.
-        Parameters
-        ----------
-        batch_max_steps : int
-            The maximum length of input signal in batch.
-        hop_size : int
-            Hop size of auxiliary features.
-        aux_context_window : int
-            Context window size for auxiliary feature conv.
+        Args:
+            batch_max_steps (int): The maximum length of input signal in batch.
+            hop_size (int): Hop size of auxiliary features.
+            aux_context_window (int): Context window size for auxiliary feature conv.
         """
         if batch_max_steps % hop_size != 0:
@@ -56,17 +52,14 @@ class Clip(object):
     def __call__(self, batch):
         """Convert into batch tensors.
-        Parameters
-        ----------
-        batch : list
-            list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
+        Args:
+            batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
-        Returns
-        ----------
-        Tensor
+        Returns:
+            Tensor:
                 Auxiliary feature batch (B, C, T'), where
                 T = (T' - 2 * aux_context_window) * hop_size.
-        Tensor
+            Tensor:
                 Target signal batch (B, 1, T).
         """
@@ -104,8 +97,7 @@ class Clip(object):
     def _adjust_length(self, x, c):
         """Adjust the audio and feature lengths.
-        Note
-        -------
+        Note:
             Basically we assume that the length of x and c are adjusted
             through preprocessing stage, but if we use other library processed
             features, this process will be needed.
@@ -162,22 +154,14 @@ class WaveRNNClip(Clip):
         # voc_pad = 2 this will pad the input so that the resnet can 'see' wider than input length
         # max_offsets = n_frames - 2 - (mel_win + 2 * hp.voc_pad) = n_frames - 15
         """Convert into batch tensors.
-        Parameters
-        ----------
-        batch : list
-            list of tuple of the pair of audio and features.
-            Audio shape (T, ), features shape(T', C).
-        Returns
-        ----------
-        Tensor
-            Input signal batch (B, 1, T).
-        Tensor
-            Target signal batch (B, 1, T).
-        Tensor
-            Auxiliary feature batch (B, C, T'), where
-            T = (T' - 2 * aux_context_window) * hop_size.
+        Args:
+            batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
+        Returns:
+            Tensor: Input signal batch (B, 1, T).
+            Tensor: Target signal batch (B, 1, T).
+            Tensor: Auxiliary feature batch (B, C, T'),
+                where T = (T' - 2 * aux_context_window) * hop_size.
         """
         # check length
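The Clip docstrings above encode one invariant: batch_max_steps must be a multiple of hop_size so that audio length T and feature length T' stay aligned as T = T' * hop_size. A simplified sketch of that alignment check and clipping (illustrative, not the real collater, and using plain lists instead of tensors):

```python
from typing import List, Sequence, Tuple


def clip_pair(audio: Sequence[float],
              feats: List[Sequence[float]],
              batch_max_steps: int,
              hop_size: int) -> Tuple[Sequence[float], List[Sequence[float]]]:
    """Clip an (audio, feature) pair to at most batch_max_steps samples.

    Args:
        audio (Sequence[float]): samples of shape (T,)
        feats (List[Sequence[float]]): frames of shape (T', C)
        batch_max_steps (int): maximum signal length in a batch
        hop_size (int): hop size of auxiliary features

    Returns:
        Tuple: (clipped audio, clipped features) with len(audio) == len(feats) * hop_size.
    """
    if batch_max_steps % hop_size != 0:
        raise ValueError("batch_max_steps must be divisible by hop_size")
    # Keep at most batch_max_steps // hop_size frames, then align the audio.
    max_frames = batch_max_steps // hop_size
    feats = feats[:max_frames]
    audio = audio[:len(feats) * hop_size]
    return audio, feats
```

For example, clipping 1000 samples with 5 frames at hop_size=256 and batch_max_steps=768 keeps 3 frames and 768 samples.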
paddlespeech/t2s/exps/transformer_tts/preprocess.py
@@ -31,15 +31,12 @@ from paddlespeech.t2s.frontend import English
 def get_lj_sentences(file_name, frontend):
-    '''
-    read MFA duration.txt
-    Parameters
-    ----------
-    file_name : str or Path
-    Returns
-    ----------
-    Dict
-        sentence: {'utt': ([char], [int])}
+    '''read MFA duration.txt
+    Args:
+        file_name (str or Path)
+    Returns:
+        Dict: sentence: {'utt': ([char], [int])}
     '''
     f = open(file_name, 'r')
     sentence = {}
@@ -59,14 +56,11 @@ def get_lj_sentences(file_name, frontend):
 def get_input_token(sentence, output_path):
-    '''
-    get phone set from training data and save it
-    Parameters
-    ----------
-    sentence : Dict
-        sentence: {'utt': ([char], str)}
-    output_path : str or path
-        path to save phone_id_map
+    '''get phone set from training data and save it
+    Args:
+        sentence (Dict): sentence: {'utt': ([char], str)}
+        output_path (str or path): path to save phone_id_map
     '''
     phn_token = set()
     for utt in sentence:
...
paddlespeech/t2s/frontend/arpabet.py
@@ -133,16 +133,11 @@ class ARPABET(Phonetics):
     def phoneticize(self, sentence, add_start_end=False):
         """ Normalize the input text sequence and convert it into pronunciation sequence.
-        Parameters
-        -----------
-        sentence: str
-            The input text sequence.
+        Args:
+            sentence (str): The input text sequence.
-        Returns
-        ----------
-        List[str]
-            The list of pronunciation sequence.
+        Returns:
+            List[str]: The list of pronunciation sequence.
         """
         phonemes = [self._remove_vowels(item) for item in self.backend(sentence)
@@ -157,15 +152,11 @@ class ARPABET(Phonetics):
     def numericalize(self, phonemes):
         """ Convert pronunciation sequence into pronunciation id sequence.
-        Parameters
-        -----------
-        phonemes: List[str]
-            The list of pronunciation sequence.
+        Args:
+            phonemes (List[str]): The list of pronunciation sequence.
-        Returns
-        ----------
-        List[int]
-            The list of pronunciation id sequence.
+        Returns:
+            List[int]: The list of pronunciation id sequence.
         """
         ids = [self.vocab.lookup(item) for item in phonemes]
         return ids
@@ -173,14 +164,11 @@ class ARPABET(Phonetics):
     def reverse(self, ids):
         """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
-        Parameters
-        -----------
-        ids: List[int]
-            The list of pronunciation id sequence.
+        Args:
+            ids (List[int]): The list of pronunciation id sequence.
-        Returns
-        ----------
-        List[str]
+        Returns:
+            List[str]:
                 The list of pronunciation sequence.
         """
         return [self.vocab.reverse(i) for i in ids]
@@ -188,15 +176,11 @@ class ARPABET(Phonetics):
     def __call__(self, sentence, add_start_end=False):
         """ Convert the input text sequence into pronunciation id sequence.
-        Parameters
-        -----------
-        sentence: str
-            The input text sequence.
+        Args:
+            sentence (str): The input text sequence.
-        Returns
-        ----------
-        List[str]
-            The list of pronunciation id sequence.
+        Returns:
+            List[str]: The list of pronunciation id sequence.
         """
         return self.numericalize(self.phoneticize(sentence, add_start_end=add_start_end))
@@ -229,15 +213,11 @@ class ARPABETWithStress(Phonetics):
     def phoneticize(self, sentence, add_start_end=False):
         """ Normalize the input text sequence and convert it into pronunciation sequence.
-        Parameters
-        -----------
-        sentence: str
-            The input text sequence.
+        Args:
+            sentence (str): The input text sequence.
-        Returns
-        ----------
-        List[str]
-            The list of pronunciation sequence.
+        Returns:
+            List[str]: The list of pronunciation sequence.
         """
         phonemes = self.backend(sentence)
         if add_start_end:
@@ -250,46 +230,32 @@ class ARPABETWithStress(Phonetics):
     def numericalize(self, phonemes):
         """ Convert pronunciation sequence into pronunciation id sequence.
-        Parameters
-        -----------
-        phonemes: List[str]
-            The list of pronunciation sequence.
+        Args:
+            phonemes (List[str]): The list of pronunciation sequence.
-        Returns
-        ----------
-        List[int]
-            The list of pronunciation id sequence.
+        Returns:
+            List[int]: The list of pronunciation id sequence.
         """
         ids = [self.vocab.lookup(item) for item in phonemes]
         return ids

     def reverse(self, ids):
         """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
-        Parameters
-        -----------
-        ids: List[int]
-            The list of pronunciation id sequence.
+        Args:
+            ids (List[int]): The list of pronunciation id sequence.
-        Returns
-        ----------
-        List[str]
-            The list of pronunciation sequence.
+        Returns:
+            List[str]: The list of pronunciation sequence.
         """
         return [self.vocab.reverse(i) for i in ids]

     def __call__(self, sentence, add_start_end=False):
         """ Convert the input text sequence into pronunciation id sequence.
-        Parameters
-        -----------
-        sentence: str
-            The input text sequence.
+        Args:
+            sentence (str): The input text sequence.
-        Returns
-        ----------
-        List[str]
-            The list of pronunciation id sequence.
+        Returns:
+            List[str]: The list of pronunciation id sequence.
         """
         return self.numericalize(self.phoneticize(sentence, add_start_end=add_start_end))
paddlespeech/t2s/frontend/phonectic.py
@@ -65,14 +65,10 @@ class English(Phonetics):
     def phoneticize(self, sentence):
         """ Normalize the input text sequence and convert it into pronunciation sequence.
-        Parameters
-        -----------
-        sentence: str
-            The input text sequence.
-        Returns
-        ----------
-        List[str]
-            The list of pronunciation sequence.
+        Args:
+            sentence (str): The input text sequence.
+        Returns:
+            List[str]: The list of pronunciation sequence.
         """
         start = self.vocab.start_symbol
         end = self.vocab.end_symbol
@@ -123,14 +119,10 @@ class English(Phonetics):
     def numericalize(self, phonemes):
         """ Convert pronunciation sequence into pronunciation id sequence.
-        Parameters
-        -----------
-        phonemes: List[str]
-            The list of pronunciation sequence.
-        Returns
-        ----------
-        List[int]
-            The list of pronunciation id sequence.
+        Args:
+            phonemes (List[str]): The list of pronunciation sequence.
+        Returns:
+            List[int]: The list of pronunciation id sequence.
         """
         ids = [self.vocab.lookup(item) for item in phonemes
@@ -140,27 +132,19 @@ class English(Phonetics):
     def reverse(self, ids):
         """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
-        Parameters
-        -----------
-        ids: List[int]
-            The list of pronunciation id sequence.
-        Returns
-        ----------
-        List[str]
-            The list of pronunciation sequence.
+        Args:
+            ids (List[int]): The list of pronunciation id sequence.
+        Returns:
+            List[str]: The list of pronunciation sequence.
         """
         return [self.vocab.reverse(i) for i in ids]

     def __call__(self, sentence):
         """ Convert the input text sequence into pronunciation id sequence.
-        Parameters
-        -----------
-        sentence: str
-            The input text sequence.
-        Returns
-        ----------
-        List[str]
-            The list of pronunciation id sequence.
+        Args:
+            sentence (str): The input text sequence.
+        Returns:
+            List[str]: The list of pronunciation id sequence.
         """
         return self.numericalize(self.phoneticize(sentence))
@@ -183,27 +167,20 @@ class EnglishCharacter(Phonetics):
     def phoneticize(self, sentence):
         """ Normalize the input text sequence.
-        Parameters
-        -----------
-        sentence: str
-            The input text sequence.
-        Returns
-        ----------
-        str
-            A text sequence after normalize.
+        Args:
+            sentence (str): The input text sequence.
+        Returns:
+            str: A text sequence after normalize.
         """
         words = normalize(sentence)
         return words

     def numericalize(self, sentence):
         """ Convert a text sequence into ids.
-        Parameters
-        -----------
-        sentence: str
-            The input text sequence.
-        Returns
-        ----------
-        List[int]
+        Args:
+            sentence (str): The input text sequence.
+        Returns:
+            List[int]:
                 List of a character id sequence.
         """
         ids = [
@@ -214,27 +191,19 @@ class EnglishCharacter(Phonetics):
     def reverse(self, ids):
         """ Convert a character id sequence into text.
-        Parameters
-        -----------
-        ids: List[int]
-            List of a character id sequence.
-        Returns
-        ----------
-        str
-            The input text sequence.
+        Args:
+            ids (List[int]): List of a character id sequence.
+        Returns:
+            str: The input text sequence.
         """
         return [self.vocab.reverse(i) for i in ids]

     def __call__(self, sentence):
         """ Normalize the input text sequence and convert it into character id sequence.
-        Parameters
-        -----------
-        sentence: str
-            The input text sequence.
-        Returns
-        ----------
-        List[int]
-            List of a character id sequence.
+        Args:
+            sentence (str): The input text sequence.
+        Returns:
+            List[int]: List of a character id sequence.
         """
         return self.numericalize(self.phoneticize(sentence))
@@ -264,14 +233,10 @@ class Chinese(Phonetics):
     def phoneticize(self, sentence):
         """ Normalize the input text sequence and convert it into pronunciation sequence.
-        Parameters
-        -----------
-        sentence: str
-            The input text sequence.
-        Returns
-        ----------
-        List[str]
-            The list of pronunciation sequence.
+        Args:
+            sentence (str): The input text sequence.
+        Returns:
+            List[str]: The list of pronunciation sequence.
         """
         # simplified = self.opencc_backend.convert(sentence)
         simplified = sentence
@@ -296,28 +261,20 @@ class Chinese(Phonetics):
     def numericalize(self, phonemes):
         """ Convert pronunciation sequence into pronunciation id sequence.
-        Parameters
-        -----------
-        phonemes: List[str]
-            The list of pronunciation sequence.
-        Returns
-        ----------
-        List[int]
-            The list of pronunciation id sequence.
+        Args:
+            phonemes (List[str]): The list of pronunciation sequence.
+        Returns:
+            List[int]: The list of pronunciation id sequence.
         """
         ids = [self.vocab.lookup(item) for item in phonemes]
         return ids

     def __call__(self, sentence):
         """ Convert the input text sequence into pronunciation id sequence.
-        Parameters
-        -----------
-        sentence: str
-            The input text sequence.
-        Returns
-        ----------
-        List[str]
-            The list of pronunciation id sequence.
+        Args:
+            sentence (str): The input text sequence.
+        Returns:
+            List[str]: The list of pronunciation id sequence.
         """
         return self.numericalize(self.phoneticize(sentence))
@@ -329,13 +286,9 @@ class Chinese(Phonetics):
     def reverse(self, ids):
         """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
-        Parameters
-        -----------
-        ids: List[int]
-            The list of pronunciation id sequence.
-        Returns
-        ----------
-        List[str]
-            The list of pronunciation sequence.
+        Args:
+            ids (List[int]): The list of pronunciation id sequence.
+        Returns:
+            List[str]: The list of pronunciation sequence.
         """
         return [self.vocab.reverse(i) for i in ids]
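The frontend classes above all share the same pipeline: phoneticize → numericalize → reverse. A toy end-to-end sketch of that pipeline (the two-symbol "vocabulary" and character-level phoneticization are made up for illustration):

```python
from typing import List


class ToyPhonetics:
    """Toy frontend mirroring the phoneticize/numericalize/reverse pipeline."""

    def __init__(self, symbols: List[str]):
        self.id_of = {s: i for i, s in enumerate(symbols)}
        self.symbol_of = {i: s for i, s in enumerate(symbols)}

    def phoneticize(self, sentence: str) -> List[str]:
        # Pretend every character is its own "phoneme".
        return list(sentence)

    def numericalize(self, phonemes: List[str]) -> List[int]:
        return [self.id_of[p] for p in phonemes]

    def reverse(self, ids: List[int]) -> List[str]:
        return [self.symbol_of[i] for i in ids]

    def __call__(self, sentence: str) -> List[int]:
        return self.numericalize(self.phoneticize(sentence))
```

Calling the object chains the two forward steps, exactly as the __call__ docstrings above describe; reverse undoes numericalize.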
paddlespeech/t2s/frontend/vocab.py
@@ -20,22 +20,12 @@ __all__ = ["Vocab"]
 class Vocab(object):
     """ Vocabulary.
-    Parameters
-    -----------
-    symbols: Iterable[str]
-        Common symbols.
-    padding_symbol: str, optional
-        Symbol for pad. Defaults to "<pad>".
-    unk_symbol: str, optional
-        Symbol for unknown. Defaults to "<unk>"
-    start_symbol: str, optional
-        Symbol for start. Defaults to "<s>"
-    end_symbol: str, optional
-        Symbol for end. Defaults to "</s>"
+    Args:
+        symbols (Iterable[str]): Common symbols.
+        padding_symbol (str, optional): Symbol for pad. Defaults to "<pad>".
+        unk_symbol (str, optional): Symbol for unknown. Defaults to "<unk>".
+        start_symbol (str, optional): Symbol for start. Defaults to "<s>".
+        end_symbol (str, optional): Symbol for end. Defaults to "</s>".
     """

     def __init__(self,
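A minimal sketch of a vocabulary with the special symbols listed in the docstring above ("<pad>", "<unk>", "<s>", "</s>"). This is a simplification for illustration, not the real Vocab class:

```python
from typing import Iterable


class MiniVocab:
    """Simplified vocabulary: special symbols first, then common symbols.

    Args:
        symbols (Iterable[str]): Common symbols.
    """

    def __init__(self,
                 symbols: Iterable[str],
                 padding_symbol: str="<pad>",
                 unk_symbol: str="<unk>",
                 start_symbol: str="<s>",
                 end_symbol: str="</s>"):
        specials = [padding_symbol, unk_symbol, start_symbol, end_symbol]
        self.unk_symbol = unk_symbol
        self._stoi = {s: i for i, s in enumerate(specials + list(symbols))}
        self._itos = {i: s for s, i in self._stoi.items()}

    def lookup(self, symbol: str) -> int:
        # Unknown symbols map to the <unk> id.
        return self._stoi.get(symbol, self._stoi[self.unk_symbol])

    def reverse(self, idx: int) -> str:
        return self._itos[idx]
```

Placing the special symbols first gives them stable ids (here `<pad>` is 0), a common convention so padded positions can be masked by id.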
paddlespeech/t2s/frontend/zh_normalization/chronology.py
@@ -44,11 +44,9 @@ RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])'
 def replace_time(match) -> str:
     """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
+    Args:
+        match (re.Match)
+    Returns:
         str
     """
@@ -87,11 +85,9 @@ RE_DATE = re.compile(r'(\d{4}|\d{2})年'
 def replace_date(match) -> str:
     """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
+    Args:
+        match (re.Match)
+    Returns:
         str
     """
     year = match.group(1)
@@ -114,11 +110,9 @@ RE_DATE2 = re.compile(
 def replace_date2(match) -> str:
     """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
+    Args:
+        match (re.Match)
+    Returns:
         str
     """
     year = match.group(1)
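All the replace_* helpers in zh_normalization follow one shape: a callback for re.sub that takes a re.Match and returns a replacement string. A simplified, self-contained example in the same style (the pattern and the verbalization below are illustrative, not the library's actual rules):

```python
import re

# Hour pattern borrowed in spirit from RE_TIME_RANGE; minutes simplified.
RE_SIMPLE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3]):([0-5][0-9])')


def replace_simple_time(match) -> str:
    """
    Args:
        match (re.Match)
    Returns:
        str
    """
    hour, minute = match.group(1), match.group(2)
    # Verbalize "HH:MM" as "H点M分" (illustrative only).
    return f"{int(hour)}点{int(minute)}分"


def normalize_times(text: str) -> str:
    # re.sub calls the callback once per match.
    return RE_SIMPLE_TIME.sub(replace_simple_time, text)
```

For example, `normalize_times("会议在14:30开始")` yields "会议在14点30分开始".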
paddlespeech/t2s/frontend/zh_normalization/num.py
@@ -36,11 +36,9 @@ RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')
 def replace_frac(match) -> str:
     """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
+    Args:
+        match (re.Match)
+    Returns:
         str
     """
     sign = match.group(1)
@@ -59,11 +57,9 @@ RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%')
 def replace_percentage(match) -> str:
     """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
+    Args:
+        match (re.Match)
+    Returns:
         str
     """
     sign = match.group(1)
@@ -81,11 +77,9 @@ RE_INTEGER = re.compile(r'(-)' r'(\d+)')
 def replace_negative_num(match) -> str:
     """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
+    Args:
+        match (re.Match)
+    Returns:
         str
     """
     sign = match.group(1)
@@ -103,11 +97,9 @@ RE_DEFAULT_NUM = re.compile(r'\d{3}\d*')
 def replace_default_num(match):
     """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
+    Args:
+        match (re.Match)
+    Returns:
         str
     """
     number = match.group(0)
@@ -124,11 +116,9 @@ RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))')
 def replace_positive_quantifier(match) -> str:
     """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
+    Args:
+        match (re.Match)
+    Returns:
         str
     """
     number = match.group(1)
@@ -142,11 +132,9 @@ def replace_positive_quantifier(match) -> str:
 def replace_number(match) -> str:
     """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
+    Args:
+        match (re.Match)
+    Returns:
         str
     """
     sign = match.group(1)
@@ -169,11 +157,9 @@ RE_RANGE = re.compile(
 def replace_range(match) -> str:
     """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
+    Args:
+        match (re.Match)
+    Returns:
         str
     """
     first, second = match.group(1), match.group(8)
paddlespeech/t2s/frontend/zh_normalization/phonecode.py
@@ -45,11 +45,9 @@ def phone2str(phone_string: str, mobile=True) -> str:
 def replace_phone(match) -> str:
     """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
+    Args:
+        match (re.Match)
+    Returns:
         str
     """
     return phone2str(match.group(0), mobile=False)
@@ -57,11 +55,9 @@ def replace_phone(match) -> str:
 def replace_mobile(match) -> str:
     """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
+    Args:
+        match (re.Match)
+    Returns:
         str
     """
     return phone2str(match.group(0))
paddlespeech/t2s/frontend/zh_normalization/quantifier.py
@@ -22,11 +22,9 @@ RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)')
 def replace_temperature(match) -> str:
     """
-    Parameters
-    ----------
-    match : re.Match
-    Returns
-    ----------
+    Args:
+        match (re.Match)
+    Returns:
         str
     """
     sign = match.group(1)
paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
@@ -55,14 +55,10 @@ class TextNormalizer():
     def _split(self, text: str, lang="zh") -> List[str]:
         """Split long text into sentences with sentence-splitting punctuations.
-        Parameters
-        ----------
-        text : str
-            The input text.
-        Returns
-        -------
-        List[str]
-            Sentences.
+        Args:
+            text (str): The input text.
+        Returns:
+            List[str]: Sentences.
         """
         # Only for pure Chinese here
         if lang == "zh":
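A simplified sketch of what a _split-style sentence splitter does: break on sentence-ending punctuation and keep the punctuation attached to its sentence. The punctuation set and the newline-insertion trick below are assumptions for illustration, not the library's exact implementation:

```python
import re
from typing import List

# Illustrative sentence-ending punctuation set for Chinese text.
SENTENCE_SPLITOR = re.compile(r'([。！？!?])')


def split_sentences(text: str, lang: str="zh") -> List[str]:
    """Split long text into sentences with sentence-splitting punctuations.

    Args:
        text (str): The input text.
    Returns:
        List[str]: Sentences.
    """
    # Only handle pure Chinese here, mirroring the method above.
    if lang != "zh":
        return [text]
    # Insert a newline after each sentence-ending mark, then split on it,
    # so the punctuation stays with its sentence.
    text = SENTENCE_SPLITOR.sub(r'\1\n', text)
    return [s for s in text.strip().split('\n') if s]
```

For example, `split_sentences("今天天气很好。我们出去玩！")` yields `["今天天气很好。", "我们出去玩！"]`.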
paddlespeech/t2s/models/fastspeech2/fastspeech2.py
(This diff is collapsed.)
paddlespeech/t2s/models/hifigan/hifigan.py
浏览文件 @
9699c007
...
...
@@ -37,34 +37,20 @@ class HiFiGANGenerator(nn.Layer):
use_weight_norm
:
bool
=
True
,
init_type
:
str
=
"xavier_uniform"
,
):
"""Initialize HiFiGANGenerator module.
Parameters
----------
in_channels : int
Number of input channels.
out_channels : int
Number of output channels.
channels : int
Number of hidden representation channels.
kernel_size : int
Kernel size of initial and final conv layer.
upsample_scales : list
List of upsampling scales.
upsample_kernel_sizes : list
List of kernel sizes for upsampling layers.
resblock_kernel_sizes : list
List of kernel sizes for residual blocks.
resblock_dilations : list
List of dilation list for residual blocks.
use_additional_convs : bool
Whether to use additional conv layers in residual blocks.
bias : bool
Whether to add bias parameter in convolution layers.
nonlinear_activation : str
Activation function module name.
nonlinear_activation_params : dict
Hyperparameters for activation function.
use_weight_norm : bool
Whether to use weight norm.
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
channels (int): Number of hidden representation channels.
kernel_size (int): Kernel size of initial and final conv layer.
upsample_scales (list): List of upsampling scales.
upsample_kernel_sizes (list): List of kernel sizes for upsampling layers.
resblock_kernel_sizes (list): List of kernel sizes for residual blocks.
resblock_dilations (list): List of dilation list for residual blocks.
use_additional_convs (bool): Whether to use additional conv layers in residual blocks.
bias (bool): Whether to add bias parameter in convolution layers.
nonlinear_activation (str): Activation function module name.
nonlinear_activation_params (dict): Hyperparameters for activation function.
use_weight_norm (bool): Whether to use weight norm.
If set to true, it will be applied to all of the conv layers.
"""
super
().
__init__
()
...
...
@@ -134,14 +120,11 @@ class HiFiGANGenerator(nn.Layer):
def
forward
(
self
,
c
):
"""Calculate forward propagation.
Parameters
----------
c : Tensor
Input tensor (B, in_channels, T).
Returns
----------
Tensor
Output tensor (B, out_channels, T).
Args:
c (Tensor): Input tensor (B, in_channels, T).
Returns:
Tensor: Output tensor (B, out_channels, T).
"""
c
=
self
.
input_conv
(
c
)
for
i
in
range
(
self
.
num_upsamples
):
...
...
@@ -196,14 +179,11 @@ class HiFiGANGenerator(nn.Layer):
def
inference
(
self
,
c
):
"""Perform inference.
Parameters
----------
c : Tensor
Input tensor (T, in_channels).
Args:
c (Tensor): Input tensor (T, in_channels).
normalize_before (bool): Whether to perform normalization.
Returns
----------
Tensor
Returns:
Tensor:
Output tensor (T ** prod(upsample_scales), out_channels).
"""
c
=
self
.
forward
(
c
.
transpose
([
1
,
0
]).
unsqueeze
(
0
))
...
...
@@ -229,35 +209,22 @@ class HiFiGANPeriodDiscriminator(nn.Layer):
            use_spectral_norm: bool=False,
            init_type: str="xavier_uniform", ):
        """Initialize HiFiGANPeriodDiscriminator module.
-        Parameters
-        ----------
-        in_channels : int
-            Number of input channels.
-        out_channels : int
-            Number of output channels.
-        period : int
-            Period.
-        kernel_sizes : list
-            Kernel sizes of initial conv layers and the final conv layer.
-        channels : int
-            Number of initial channels.
-        downsample_scales : list
-            List of downsampling scales.
-        max_downsample_channels : int
-            Number of maximum downsampling channels.
-        use_additional_convs : bool
-            Whether to use additional conv layers in residual blocks.
-        bias : bool
-            Whether to add bias parameter in convolution layers.
-        nonlinear_activation : str
-            Activation function module name.
-        nonlinear_activation_params : dict
-            Hyperparameters for activation function.
-        use_weight_norm : bool
-            Whether to use weight norm.
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+            period (int): Period.
+            kernel_sizes (list): Kernel sizes of initial conv layers and the final conv layer.
+            channels (int): Number of initial channels.
+            downsample_scales (list): List of downsampling scales.
+            max_downsample_channels (int): Number of maximum downsampling channels.
+            use_additional_convs (bool): Whether to use additional conv layers in residual blocks.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            nonlinear_activation (str): Activation function module name.
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
+            use_weight_norm (bool): Whether to use weight norm.
                If set to true, it will be applied to all of the conv layers.
-        use_spectral_norm : bool
-            Whether to use spectral norm.
+            use_spectral_norm (bool): Whether to use spectral norm.
                If set to true, it will be applied to all of the conv layers.
        """
        super().__init__()
...
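The `channels` / `downsample_scales` / `max_downsample_channels` arguments documented above describe a schedule where the channel count grows per downsampling stage until it hits the cap. A sketch of that schedule; the 4x growth factor is an assumption based on common HiFi-GAN implementations, it is not shown in this diff:

```python
def channel_schedule(channels, downsample_scales, max_downsample_channels):
    # One conv stage per downsample scale; channels grow (assumed 4x per
    # stage) but never exceed max_downsample_channels.
    chs, out = channels, []
    for _ in downsample_scales:
        chs = min(chs * 4, max_downsample_channels)
        out.append(chs)
    return out
```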
@@ -307,14 +274,11 @@ class HiFiGANPeriodDiscriminator(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        c : Tensor
-            Input tensor (B, in_channels, T).
-        Returns
-        ----------
-        list
-            List of each layer's tensors.
+        Args:
+            c (Tensor): Input tensor (B, in_channels, T).
+        Returns:
+            list: List of each layer's tensors.
        """
        # transform 1d to 2d -> (B, C, T/P, P)
        b, c, t = paddle.shape(x)
...
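The "transform 1d to 2d -> (B, C, T/P, P)" comment is the core of the period discriminator: the waveform is padded so T divides the period, then folded so a 2D convolution can look across period-spaced samples. A nested-list sketch of that folding (tail-repeat padding stands in for the layer's reflect padding):

```python
def fold_by_period(x, period):
    # x is a nested list with shape (B, C, T).
    out = []
    for sample in x:
        chans = []
        for chan in sample:
            t = len(chan)
            n_pad = (-t) % period  # pad T up to a multiple of the period
            chan = (chan + chan[-n_pad:]) if n_pad else chan
            # regroup into (T/P) rows of length P -> (B, C, T/P, P)
            chans.append([chan[i:i + period] for i in range(0, len(chan), period)])
        out.append(chans)
    return out
```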
@@ -379,12 +343,10 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer):
            },
            init_type: str="xavier_uniform", ):
        """Initialize HiFiGANMultiPeriodDiscriminator module.
-        Parameters
-        ----------
-        periods : list
-            List of periods.
-        discriminator_params : dict
-            Parameters for hifi-gan period discriminator module.
+        Args:
+            periods (list): List of periods.
+            discriminator_params (dict): Parameters for hifi-gan period discriminator module.
                The period parameter will be overwritten.
        """
        super().__init__()
...
@@ -399,14 +361,11 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input noise signal (B, 1, T).
-        Returns
-        ----------
-        List
-            List of list of each discriminator outputs, which consists of each layer output tensors.
+        Args:
+            x (Tensor): Input noise signal (B, 1, T).
+        Returns:
+            List: List of list of each discriminator outputs, which consists of each layer output tensors.
        """
        outs = []
        for f in self.discriminators:
...
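The "list of list" return shape these docstrings keep describing means `outs[d][l]` is the l-th layer output of the d-th sub-discriminator: each sub-discriminator returns its per-layer feature maps (used for the feature-matching loss), and the wrapper just collects them. A toy stand-in showing the nesting:

```python
class ToyDisc:
    # Stand-in for one period/scale sub-discriminator that returns the
    # output of each of its layers.
    def __init__(self, n_layers):
        self.n_layers = n_layers

    def __call__(self, x):
        outs = []
        for _ in range(self.n_layers):
            x = x * 0.5  # stand-in for one conv layer
            outs.append(x)
        return outs

discriminators = [ToyDisc(3), ToyDisc(3)]
outs = [d(1.0) for d in discriminators]  # outs[d][l]
```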
@@ -434,32 +393,21 @@ class HiFiGANScaleDiscriminator(nn.Layer):
            use_spectral_norm: bool=False,
            init_type: str="xavier_uniform", ):
        """Initilize HiFiGAN scale discriminator module.
-        Parameters
-        ----------
-        in_channels : int
-            Number of input channels.
-        out_channels : int
-            Number of output channels.
-        kernel_sizes : list
-            List of four kernel sizes. The first will be used for the first conv layer,
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+            kernel_sizes (list): List of four kernel sizes. The first will be used for the first conv layer,
                and the second is for downsampling part, and the remaining two are for output layers.
-        channels : int
-            Initial number of channels for conv layer.
-        max_downsample_channels : int
-            Maximum number of channels for downsampling layers.
-        bias : bool
-            Whether to add bias parameter in convolution layers.
-        downsample_scales : list
-            List of downsampling scales.
-        nonlinear_activation : str
-            Activation function module name.
-        nonlinear_activation_params : dict
-            Hyperparameters for activation function.
-        use_weight_norm : bool
-            Whether to use weight norm.
+            channels (int): Initial number of channels for conv layer.
+            max_downsample_channels (int): Maximum number of channels for downsampling layers.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            downsample_scales (list): List of downsampling scales.
+            nonlinear_activation (str): Activation function module name.
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
+            use_weight_norm (bool): Whether to use weight norm.
                If set to true, it will be applied to all of the conv layers.
-        use_spectral_norm : bool
-            Whether to use spectral norm.
+            use_spectral_norm (bool): Whether to use spectral norm.
                If set to true, it will be applied to all of the conv layers.
        """
        super().__init__()
...
@@ -546,14 +494,11 @@ class HiFiGANScaleDiscriminator(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input noise signal (B, 1, T).
-        Returns
-        ----------
-        List
-            List of output tensors of each layer.
+        Args:
+            x (Tensor): Input noise signal (B, 1, T).
+        Returns:
+            List: List of output tensors of each layer.
        """
        outs = []
        for f in self.layers:
...
@@ -613,20 +558,14 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer):
            follow_official_norm: bool=False,
            init_type: str="xavier_uniform", ):
        """Initilize HiFiGAN multi-scale discriminator module.
-        Parameters
-        ----------
-        scales : int
-            Number of multi-scales.
-        downsample_pooling : str
-            Pooling module name for downsampling of the inputs.
-        downsample_pooling_params : dict
-            Parameters for the above pooling module.
-        discriminator_params : dict
-            Parameters for hifi-gan scale discriminator module.
-        follow_official_norm : bool
-            Whether to follow the norm setting of the official
-            implementaion. The first discriminator uses spectral norm and the other
-            discriminators use weight norm.
+        Args:
+            scales (int): Number of multi-scales.
+            downsample_pooling (str): Pooling module name for downsampling of the inputs.
+            downsample_pooling_params (dict): Parameters for the above pooling module.
+            discriminator_params (dict): Parameters for hifi-gan scale discriminator module.
+            follow_official_norm (bool): Whether to follow the norm setting of the official
                implementaion. The first discriminator uses spectral norm and the other discriminators use weight norm.
        """
        super().__init__()
...
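The `follow_official_norm` behaviour spelled out in the docstring (spectral norm on the first sub-discriminator, weight norm on the rest) reduces to a small selection rule; a sketch of just that rule, separated from the layer construction:

```python
def norm_choices(scales: int, follow_official_norm: bool):
    # Which normalization each of the `scales` sub-discriminators gets.
    if not follow_official_norm:
        return ["weight"] * scales
    return ["spectral"] + ["weight"] * (scales - 1)
```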
@@ -651,14 +590,11 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input noise signal (B, 1, T).
-        Returns
-        ----------
-        List
-            List of list of each discriminator outputs, which consists of each layer output tensors.
+        Args:
+            x (Tensor): Input noise signal (B, 1, T).
+        Returns:
+            List: List of list of each discriminator outputs, which consists of each layer output tensors.
        """
        outs = []
        for f in self.discriminators:
...
@@ -715,23 +651,16 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer):
            },
            init_type: str="xavier_uniform", ):
        """Initilize HiFiGAN multi-scale + multi-period discriminator module.
-        Parameters
-        ----------
-        scales : int
-            Number of multi-scales.
-        scale_downsample_pooling : str
-            Pooling module name for downsampling of the inputs.
-        scale_downsample_pooling_params : dict
-            Parameters for the above pooling module.
-        scale_discriminator_params : dict
-            Parameters for hifi-gan scale discriminator module.
-        follow_official_norm : bool): Whether to follow the norm setting of the official
-            implementaion. The first discriminator uses spectral norm and the other
-            discriminators use weight norm.
-        periods : list
-            List of periods.
-        period_discriminator_params : dict
-            Parameters for hifi-gan period discriminator module.
+        Args:
+            scales (int): Number of multi-scales.
+            scale_downsample_pooling (str): Pooling module name for downsampling of the inputs.
+            scale_downsample_pooling_params (dict): Parameters for the above pooling module.
+            scale_discriminator_params (dict): Parameters for hifi-gan scale discriminator module.
+            follow_official_norm (bool): Whether to follow the norm setting of the official implementaion.
+                The first discriminator uses spectral norm and the other discriminators use weight norm.
+            periods (list): List of periods.
+            period_discriminator_params (dict): Parameters for hifi-gan period discriminator module.
                The period parameter will be overwritten.
        """
        super().__init__()
...
@@ -751,12 +680,10 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input noise signal (B, 1, T).
-        Returns
-        ----------
+        Args:
+            x (Tensor): Input noise signal (B, 1, T).
+        Returns:
+            List:
+                List of list of each discriminator outputs,
+                which consists of each layer output tensors.
...
paddlespeech/t2s/models/melgan/melgan.py
View file @ 9699c007
...
@@ -51,41 +51,26 @@ class MelGANGenerator(nn.Layer):
            use_causal_conv: bool=False,
            init_type: str="xavier_uniform", ):
        """Initialize MelGANGenerator module.
-        Parameters
-        ----------
-        in_channels : int
-            Number of input channels.
-        out_channels : int
-            Number of output channels,
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels,
                the number of sub-band is out_channels in multi-band melgan.
-        kernel_size : int
-            Kernel size of initial and final conv layer.
-        channels : int
-            Initial number of channels for conv layer.
-        bias : bool
-            Whether to add bias parameter in convolution layers.
-        upsample_scales : List[int]
-            List of upsampling scales.
-        stack_kernel_size : int
-            Kernel size of dilated conv layers in residual stack.
-        stacks : int
-            Number of stacks in a single residual stack.
-        nonlinear_activation : Optional[str], optional
-            Non linear activation in upsample network, by default None
-        nonlinear_activation_params : Dict[str, Any], optional
-            Parameters passed to the linear activation in the upsample network,
+            kernel_size (int): Kernel size of initial and final conv layer.
+            channels (int): Initial number of channels for conv layer.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            upsample_scales (List[int]): List of upsampling scales.
+            stack_kernel_size (int): Kernel size of dilated conv layers in residual stack.
+            stacks (int): Number of stacks in a single residual stack.
+            nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None
+            nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network,
                by default {}
-        pad : str
-            Padding function module name before dilated convolution layer.
-        pad_params : dict
-            Hyperparameters for padding function.
-        use_final_nonlinear_activation : nn.Layer
-            Activation function for the final layer.
-        use_weight_norm : bool
-            Whether to use weight norm.
+            pad (str): Padding function module name before dilated convolution layer.
+            pad_params (dict): Hyperparameters for padding function.
+            use_final_nonlinear_activation (nn.Layer): Activation function for the final layer.
+            use_weight_norm (bool): Whether to use weight norm.
                If set to true, it will be applied to all of the conv layers.
-        use_causal_conv : bool
-            Whether to use causal convolution.
+            use_causal_conv (bool): Whether to use causal convolution.
        """
        super().__init__()
...
@@ -207,14 +192,11 @@ class MelGANGenerator(nn.Layer):
    def forward(self, c):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        c : Tensor
-            Input tensor (B, in_channels, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, out_channels, T ** prod(upsample_scales)).
+        Args:
+            c (Tensor): Input tensor (B, in_channels, T).
+        Returns:
+            Tensor: Output tensor (B, out_channels, T ** prod(upsample_scales)).
        """
        out = self.melgan(c)
        return out
...
@@ -260,14 +242,11 @@ class MelGANGenerator(nn.Layer):
    def inference(self, c):
        """Perform inference.
-        Parameters
-        ----------
-        c : Union[Tensor, ndarray]
-            Input tensor (T, in_channels).
-        Returns
-        ----------
-        Tensor
-            Output tensor (out_channels*T ** prod(upsample_scales), 1).
+        Args:
+            c (Union[Tensor, ndarray]): Input tensor (T, in_channels).
+        Returns:
+            Tensor: Output tensor (out_channels*T ** prod(upsample_scales), 1).
        """
        # pseudo batch
        c = c.transpose([1, 0]).unsqueeze(0)
...
@@ -298,33 +277,22 @@ class MelGANDiscriminator(nn.Layer):
            pad_params: Dict[str, Any]={"mode": "reflect"},
            init_type: str="xavier_uniform", ):
        """Initilize MelGAN discriminator module.
-        Parameters
-        ----------
-        in_channels : int
-            Number of input channels.
-        out_channels : int
-            Number of output channels.
-        kernel_sizes : List[int]
-            List of two kernel sizes. The prod will be used for the first conv layer,
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+            kernel_sizes (List[int]): List of two kernel sizes. The prod will be used for the first conv layer,
                and the first and the second kernel sizes will be used for the last two layers.
                For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15,
                the last two layers' kernel size will be 5 and 3, respectively.
-        channels : int
-            Initial number of channels for conv layer.
-        max_downsample_channels : int
-            Maximum number of channels for downsampling layers.
-        bias : bool
-            Whether to add bias parameter in convolution layers.
-        downsample_scales : List[int]
-            List of downsampling scales.
-        nonlinear_activation : str
-            Activation function module name.
-        nonlinear_activation_params : dict
-            Hyperparameters for activation function.
-        pad : str
-            Padding function module name before dilated convolution layer.
-        pad_params : dict
-            Hyperparameters for padding function.
+            channels (int): Initial number of channels for conv layer.
+            max_downsample_channels (int): Maximum number of channels for downsampling layers.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            downsample_scales (List[int]): List of downsampling scales.
+            nonlinear_activation (str): Activation function module name.
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
+            pad (str): Padding function module name before dilated convolution layer.
+            pad_params (dict): Hyperparameters for padding function.
        """
        super().__init__()
...
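The docstring's worked example is worth pinning down: with `kernel_sizes = [5, 3]` the first conv layer uses the product of the two sizes, and the last two layers use them individually. Checking that arithmetic:

```python
import math

kernel_sizes = [5, 3]
# first conv layer: product of the two kernel sizes, 5 * 3 = 15
first_layer_kernel = math.prod(kernel_sizes)
# last two layers: the sizes themselves, 5 and 3
last_two = (kernel_sizes[0], kernel_sizes[1])
```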
@@ -395,14 +363,10 @@ class MelGANDiscriminator(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input noise signal (B, 1, T).
-        Returns
-        ----------
-        List
-            List of output tensors of each layer (for feat_match_loss).
+        Args:
+            x (Tensor): Input noise signal (B, 1, T).
+        Returns:
+            List: List of output tensors of each layer (for feat_match_loss).
        """
        outs = []
        for f in self.layers:
...
@@ -440,39 +404,24 @@ class MelGANMultiScaleDiscriminator(nn.Layer):
            use_weight_norm: bool=True,
            init_type: str="xavier_uniform", ):
        """Initilize MelGAN multi-scale discriminator module.
-        Parameters
-        ----------
-        in_channels : int
-            Number of input channels.
-        out_channels : int
-            Number of output channels.
-        scales : int
-            Number of multi-scales.
-        downsample_pooling : str
-            Pooling module name for downsampling of the inputs.
-        downsample_pooling_params : dict
-            Parameters for the above pooling module.
-        kernel_sizes : List[int]
-            List of two kernel sizes. The sum will be used for the first conv layer,
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+            scales (int): Number of multi-scales.
+            downsample_pooling (str): Pooling module name for downsampling of the inputs.
+            downsample_pooling_params (dict): Parameters for the above pooling module.
+            kernel_sizes (List[int]): List of two kernel sizes. The sum will be used for the first conv layer,
                and the first and the second kernel sizes will be used for the last two layers.
-        channels : int
-            Initial number of channels for conv layer.
-        max_downsample_channels : int
-            Maximum number of channels for downsampling layers.
-        bias : bool
-            Whether to add bias parameter in convolution layers.
-        downsample_scales : List[int]
-            List of downsampling scales.
-        nonlinear_activation : str
-            Activation function module name.
-        nonlinear_activation_params : dict
-            Hyperparameters for activation function.
-        pad : str
-            Padding function module name before dilated convolution layer.
-        pad_params : dict
-            Hyperparameters for padding function.
-        use_causal_conv : bool
-            Whether to use causal convolution.
+            channels (int): Initial number of channels for conv layer.
+            max_downsample_channels (int): Maximum number of channels for downsampling layers.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            downsample_scales (List[int]): List of downsampling scales.
+            nonlinear_activation (str): Activation function module name.
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
+            pad (str): Padding function module name before dilated convolution layer.
+            pad_params (dict): Hyperparameters for padding function.
+            use_causal_conv (bool): Whether to use causal convolution.
        """
        super().__init__()
...
@@ -514,14 +463,10 @@ class MelGANMultiScaleDiscriminator(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input noise signal (B, 1, T).
-        Returns
-        ----------
-        List
-            List of list of each discriminator outputs, which consists of each layer output tensors.
+        Args:
+            x (Tensor): Input noise signal (B, 1, T).
+        Returns:
+            List: List of list of each discriminator outputs, which consists of each layer output tensors.
        """
        outs = []
        for f in self.discriminators:
...
paddlespeech/t2s/models/melgan/style_melgan.py
View file @ 9699c007
...
@@ -52,36 +52,22 @@ class StyleMelGANGenerator(nn.Layer):
            use_weight_norm: bool=True,
            init_type: str="xavier_uniform", ):
        """Initilize Style MelGAN generator.
-        Parameters
-        ----------
-        in_channels : int
-            Number of input noise channels.
-        aux_channels : int
-            Number of auxiliary input channels.
-        channels : int
-            Number of channels for conv layer.
-        out_channels : int
-            Number of output channels.
-        kernel_size : int
-            Kernel size of conv layers.
-        dilation : int
-            Dilation factor for conv layers.
-        bias : bool
-            Whether to add bias parameter in convolution layers.
-        noise_upsample_scales : list
-            List of noise upsampling scales.
-        noise_upsample_activation : str
-            Activation function module name for noise upsampling.
-        noise_upsample_activation_params : dict
-            Hyperparameters for the above activation function.
-        upsample_scales : list
-            List of upsampling scales.
-        upsample_mode : str
-            Upsampling mode in TADE layer.
-        gated_function : str
-            Gated function in TADEResBlock ("softmax" or "sigmoid").
-        use_weight_norm : bool
-            Whether to use weight norm.
+        Args:
+            in_channels (int): Number of input noise channels.
+            aux_channels (int): Number of auxiliary input channels.
+            channels (int): Number of channels for conv layer.
+            out_channels (int): Number of output channels.
+            kernel_size (int): Kernel size of conv layers.
+            dilation (int): Dilation factor for conv layers.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            noise_upsample_scales (list): List of noise upsampling scales.
+            noise_upsample_activation (str): Activation function module name for noise upsampling.
+            noise_upsample_activation_params (dict): Hyperparameters for the above activation function.
+            upsample_scales (list): List of upsampling scales.
+            upsample_mode (str): Upsampling mode in TADE layer.
+            gated_function (str): Gated function in TADEResBlock ("softmax" or "sigmoid").
+            use_weight_norm (bool): Whether to use weight norm.
                If set to true, it will be applied to all of the conv layers.
        """
        super().__init__()
...
@@ -147,16 +133,12 @@ class StyleMelGANGenerator(nn.Layer):
    def forward(self, c, z=None):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        c : Tensor
-            Auxiliary input tensor (B, channels, T).
-        z : Tensor
-            Input noise tensor (B, in_channels, 1).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, out_channels, T ** prod(upsample_scales)).
+        Args:
+            c (Tensor): Auxiliary input tensor (B, channels, T).
+            z (Tensor): Input noise tensor (B, in_channels, 1).
+        Returns:
+            Tensor: Output tensor (B, out_channels, T ** prod(upsample_scales)).
        """
        # batch_max_steps(24000) == noise_upsample_factor(80) * upsample_factor(300)
        if z is None:
...
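The inline comment in this hunk records an invariant of the training setup: the waveform segment length must equal the product of the two upsampling factors, `batch_max_steps(24000) == noise_upsample_factor(80) * upsample_factor(300)`. Each factor is itself the product of the corresponding scale list, so the segment length is divisible by both. Checking the arithmetic:

```python
# Invariant from the source comment: the generator stretches its noise by
# the noise-upsample network (factor 80) and then the TADE upsample stack
# (factor 300), so one training segment covers 80 * 300 samples.
noise_upsample_factor = 80
upsample_factor = 300
batch_max_steps = noise_upsample_factor * upsample_factor
```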
@@ -211,14 +193,10 @@ class StyleMelGANGenerator(nn.Layer):
    def inference(self, c):
        """Perform inference.
-        Parameters
-        ----------
-        c : Tensor
-            Input tensor (T, in_channels).
-        Returns
-        ----------
-        Tensor
-            Output tensor (T ** prod(upsample_scales), out_channels).
+        Args:
+            c (Tensor): Input tensor (T, in_channels).
+        Returns:
+            Tensor: Output tensor (T ** prod(upsample_scales), out_channels).
        """
        # (1, in_channels, T)
        c = c.transpose([1, 0]).unsqueeze(0)
...
@@ -278,18 +256,13 @@ class StyleMelGANDiscriminator(nn.Layer):
            use_weight_norm: bool=True,
            init_type: str="xavier_uniform", ):
        """Initilize Style MelGAN discriminator.
-        Parameters
-        ----------
-        repeats : int
-            Number of repititons to apply RWD.
-        window_sizes : list
-            List of random window sizes.
-        pqmf_params : list
-            List of list of Parameters for PQMF modules
-        discriminator_params : dict
-            Parameters for base discriminator module.
-        use_weight_nom : bool
-            Whether to apply weight normalization.
+        Args:
+            repeats (int): Number of repititons to apply RWD.
+            window_sizes (list): List of random window sizes.
+            pqmf_params (list): List of list of Parameters for PQMF modules
+            discriminator_params (dict): Parameters for base discriminator module.
+            use_weight_nom (bool): Whether to apply weight normalization.
        """
        super().__init__()
...
@@ -325,14 +298,10 @@ class StyleMelGANDiscriminator(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, 1, T).
-        Returns
-        ----------
-        List
-            List of discriminator outputs, #items in the list will be
+        Args:
+            x (Tensor): Input tensor (B, 1, T).
+        Returns:
+            List: List of discriminator outputs, #items in the list will be
                equal to repeats * #discriminators.
        """
        outs = []
...
paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py
View file @ 9699c007
...
@@ -31,51 +31,30 @@ from paddlespeech.t2s.modules.upsample import ConvInUpsampleNet
class PWGGenerator(nn.Layer):
    """Wave Generator for Parallel WaveGAN
-    Parameters
-    ----------
-    in_channels : int, optional
-        Number of channels of the input waveform, by default 1
-    out_channels : int, optional
-        Number of channels of the output waveform, by default 1
-    kernel_size : int, optional
-        Kernel size of the residual blocks inside, by default 3
-    layers : int, optional
-        Number of residual blocks inside, by default 30
-    stacks : int, optional
-        The number of groups to split the residual blocks into, by default 3
-        Within each group, the dilation of the residual block grows
-        exponentially.
-    residual_channels : int, optional
-        Residual channel of the residual blocks, by default 64
-    gate_channels : int, optional
-        Gate channel of the residual blocks, by default 128
-    skip_channels : int, optional
-        Skip channel of the residual blocks, by default 64
-    aux_channels : int, optional
-        Auxiliary channel of the residual blocks, by default 80
-    aux_context_window : int, optional
-        The context window size of the first convolution applied to the
+    Args:
+        in_channels (int, optional): Number of channels of the input waveform, by default 1
+        out_channels (int, optional): Number of channels of the output waveform, by default 1
+        kernel_size (int, optional): Kernel size of the residual blocks inside, by default 3
+        layers (int, optional): Number of residual blocks inside, by default 30
+        stacks (int, optional): The number of groups to split the residual blocks into, by default 3
+            Within each group, the dilation of the residual block grows exponentially.
+        residual_channels (int, optional): Residual channel of the residual blocks, by default 64
+        gate_channels (int, optional): Gate channel of the residual blocks, by default 128
+        skip_channels (int, optional): Skip channel of the residual blocks, by default 64
+        aux_channels (int, optional): Auxiliary channel of the residual blocks, by default 80
+        aux_context_window (int, optional): The context window size of the first convolution applied to the
            auxiliary input, by default 2
-    dropout : float, optional
-        Dropout of the residual blocks, by default 0.
-    bias : bool, optional
-        Whether to use bias in residual blocks, by default True
-    use_weight_norm : bool, optional
-        Whether to use weight norm in all convolutions, by default True
-    use_causal_conv : bool, optional
-        Whether to use causal padding in the upsample network and residual
+        dropout (float, optional): Dropout of the residual blocks, by default 0.
+        bias (bool, optional): Whether to use bias in residual blocks, by default True
+        use_weight_norm (bool, optional): Whether to use weight norm in all convolutions, by default True
+        use_causal_conv (bool, optional): Whether to use causal padding in the upsample network and residual
            blocks, by default False
-    upsample_scales : List[int], optional
-        Upsample scales of the upsample network, by default [4, 4, 4, 4]
-    nonlinear_activation : Optional[str], optional
-        Non linear activation in upsample network, by default None
-    nonlinear_activation_params : Dict[str, Any], optional
-        Parameters passed to the linear activation in the upsample network,
+        upsample_scales (List[int], optional): Upsample scales of the upsample network, by default [4, 4, 4, 4]
+        nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None
+        nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network,
            by default {}
-    interpolate_mode : str, optional
-        Interpolation mode of the upsample network, by default "nearest"
-    freq_axis_kernel_size : int, optional
-        Kernel size along the frequency axis of the upsample network, by default 1
+        interpolate_mode (str, optional): Interpolation mode of the upsample network, by default "nearest"
+        freq_axis_kernel_size (int, optional): Kernel size along the frequency axis of the upsample network, by default 1
    """
    def __init__(
...
@@ -167,18 +146,13 @@ class PWGGenerator(nn.Layer):
    def forward(self, x, c):
        """Generate waveform.
-        Parameters
-        ----------
-        x : Tensor
-            Shape (N, C_in, T), The input waveform.
-        c : Tensor
-            Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It
+        Args:
+            x(Tensor): Shape (N, C_in, T), The input waveform.
+            c(Tensor): Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It
                is upsampled to match the time resolution of the input.
-        Returns
-        -------
-        Tensor
-            Shape (N, C_out, T), the generated waveform.
+        Returns:
+            Tensor: Shape (N, C_out, T), the generated waveform.
        """
        c = self.upsample_net(c)
        assert c.shape[-1] == x.shape[-1]
...
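The `assert c.shape[-1] == x.shape[-1]` in this hunk is what the upsample network exists to satisfy: each conditioning frame is stretched to cover `prod(upsample_scales)` waveform samples (256 with the documented default `[4, 4, 4, 4]`, i.e. a 256-sample hop). A flat-list sketch where nearest-neighbour repetition stands in for the real ConvInUpsampleNet:

```python
import math

def upsample_aux(frames, upsample_scales=(4, 4, 4, 4)):
    # Repeat each aux frame prod(upsample_scales) times so the conditioning
    # reaches the waveform's time resolution.
    hop = math.prod(upsample_scales)
    return [v for v in frames for _ in range(hop)]

mel_frames = [0.1, 0.2, 0.3]        # T' = 3 frames of one aux channel
samples = upsample_aux(mel_frames)  # len == 3 * 256, matching the waveform T
```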
@@ -218,19 +192,14 @@ class PWGGenerator(nn.Layer):
        self.apply(_remove_weight_norm)

    def inference(self, c=None):
-        """Waveform generation. This function is used for single instance
-        inference.
-        Parameters
-        ----------
-        c : Tensor, optional
-            Shape (T', C_aux), the auxiliary input, by default None
-        x : Tensor, optional
-            Shape (T, C_in), the noise waveform, by default None
-            If not provided, a sample is drawn from a gaussian distribution.
-        Returns
-        -------
-        Tensor
-            Shape (T, C_out), the generated waveform
+        """Waveform generation. This function is used for single instance inference.
+        Args:
+            c(Tensor, optional, optional): Shape (T', C_aux), the auxiliary input, by default None
+            x(Tensor, optional): Shape (T, C_in), the noise waveform, by default None
+        Returns:
+            Tensor: Shape (T, C_out), the generated waveform
        """
        # when to static, can not input x, see https://github.com/PaddlePaddle/Parakeet/pull/132/files
        x = paddle.randn(
...
@@ -244,31 +213,20 @@ class PWGDiscriminator(nn.Layer):
class PWGDiscriminator(nn.Layer):
    """A convolutional discriminator for audio.
-    Parameters
-    ----------
-    in_channels : int, optional
-        Number of channels of the input audio, by default 1
-    out_channels : int, optional
-        Output feature size, by default 1
-    kernel_size : int, optional
-        Kernel size of convolutional sublayers, by default 3
-    layers : int, optional
-        Number of layers, by default 10
-    conv_channels : int, optional
-        Feature size of the convolutional sublayers, by default 64
-    dilation_factor : int, optional
-        The factor with which dilation of each convolutional sublayers grows
-        exponentially if it is greater than 1, else the dilation of each
-        convolutional sublayers grows linearly, by default 1
-    nonlinear_activation : str, optional
-        The activation after each convolutional sublayer, by default "leakyrelu"
-    nonlinear_activation_params : Dict[str, Any], optional
-        The parameters passed to the activation's initializer, by default
+    Args:
+        in_channels (int, optional): Number of channels of the input audio, by default 1
+        out_channels (int, optional): Output feature size, by default 1
+        kernel_size (int, optional): Kernel size of convolutional sublayers, by default 3
+        layers (int, optional): Number of layers, by default 10
+        conv_channels (int, optional): Feature size of the convolutional sublayers, by default 64
+        dilation_factor (int, optional): The factor with which dilation of each convolutional sublayers grows
+            exponentially if it is greater than 1, else the dilation of each convolutional sublayers grows linearly,
+            by default 1
+        nonlinear_activation (str, optional): The activation after each convolutional sublayer, by default "leakyrelu"
+        nonlinear_activation_params (Dict[str, Any], optional): The parameters passed to the activation's initializer, by default
            {"negative_slope": 0.2}
-    bias : bool, optional
-        Whether to use bias in convolutional sublayers, by default True
-    use_weight_norm : bool, optional
-        Whether to use weight normalization at all convolutional sublayers,
+        bias (bool, optional): Whether to use bias in convolutional sublayers, by default True
+        use_weight_norm (bool, optional): Whether to use weight normalization at all convolutional sublayers,
            by default True
    """
...
@@ -330,15 +288,12 @@ class PWGDiscriminator(nn.Layer):
    def forward(self, x):
        """
-        Parameters
-        ----------
-        x : Tensor
-            Shape (N, in_channels, num_samples), the input audio.
-        Returns
-        -------
-        Tensor
-            Shape (N, out_channels, num_samples), the predicted logits.
+        Args:
+            x (Tensor): Shape (N, in_channels, num_samples), the input audio.
+        Returns:
+            Tensor: Shape (N, out_channels, num_samples), the predicted logits.
        """
        return self.conv_layers(x)
...
@@ -362,39 +317,25 @@ class PWGDiscriminator(nn.Layer):
class
ResidualPWGDiscriminator
(
nn
.
Layer
):
"""A wavenet-style discriminator for audio.
Parameters
----------
in_channels : int, optional
Number of channels of the input audio, by default 1
out_channels : int, optional
Output feature size, by default 1
kernel_size : int, optional
Kernel size of residual blocks, by default 3
layers : int, optional
Args:
    in_channels (int, optional): Number of channels of the input audio, by default 1
    out_channels (int, optional): Output feature size, by default 1
    kernel_size (int, optional): Kernel size of residual blocks, by default 3
    layers (int, optional): Number of residual blocks, by default 30
    stacks (int, optional): Number of groups of residual blocks, within which the dilation
        of each residual block grows exponentially, by default 3
    residual_channels (int, optional): Residual channels of residual blocks, by default 64
    gate_channels (int, optional): Gate channels of residual blocks, by default 128
    skip_channels (int, optional): Skip channels of residual blocks, by default 64
    dropout (float, optional): Dropout probability of residual blocks, by default 0.
    bias (bool, optional): Whether to use bias in residual blocks, by default True
    use_weight_norm (bool, optional): Whether to use weight normalization in all convolutional layers,
        by default True
    use_causal_conv (bool, optional): Whether to use causal convolution in residual blocks, by default False
    nonlinear_activation (str, optional): Activation after convolutions other than those in residual blocks,
        by default "leakyrelu"
    nonlinear_activation_params (Dict[str, Any], optional): Parameters to pass to the activation,
        by default {"negative_slope": 0.2}
"""
def __init__(
...
...
@@ -463,15 +404,11 @@ class ResidualPWGDiscriminator(nn.Layer):
def forward(self, x):
"""
Args:
    x (Tensor): Shape (N, in_channels, num_samples), the input audio.
Returns:
    Tensor: Shape (N, out_channels, num_samples), the predicted logits.
"""
x = self.first_conv(x)
skip = 0
...
...
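The `layers`/`stacks` defaults above determine the discriminator's receptive field. A back-of-envelope sketch, assuming the usual ParallelWaveGAN convention that the dilation doubles within each stack and resets at stack boundaries (the helper name is illustrative, not part of the library):

```python
def receptive_field(layers=30, stacks=3, kernel_size=3):
    """Receptive field (in samples) of a stack of dilated 1D convolutions."""
    layers_per_stack = layers // stacks
    # dilation resets at each stack boundary and doubles within a stack
    dilations = [2 ** (i % layers_per_stack) for i in range(layers)]
    # each dilated conv widens the field by (kernel_size - 1) * dilation
    return 1 + sum((kernel_size - 1) * d for d in dilations)

print(receptive_field())  # 6139 with the defaults above
```

So with the default configuration each output logit sees roughly 6k input samples.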
paddlespeech/t2s/models/tacotron2/tacotron2.py
...
...
@@ -81,69 +81,39 @@ class Tacotron2(nn.Layer):
# training related
init_type: str="xavier_uniform", ):
"""Initialize Tacotron2 module.
Args:
    idim (int): Dimension of the inputs.
    odim (int): Dimension of the outputs.
    embed_dim (int): Dimension of the token embedding.
    elayers (int): Number of encoder blstm layers.
    eunits (int): Number of encoder blstm units.
    econv_layers (int): Number of encoder conv layers.
    econv_filts (int): Number of encoder conv filter size.
    econv_chans (int): Number of encoder conv filter channels.
    dlayers (int): Number of decoder lstm layers.
    dunits (int): Number of decoder lstm units.
    prenet_layers (int): Number of prenet layers.
    prenet_units (int): Number of prenet units.
    postnet_layers (int): Number of postnet layers.
    postnet_filts (int): Number of postnet filter size.
    postnet_chans (int): Number of postnet filter channels.
    output_activation (str): Name of activation function for outputs.
    adim (int): Number of dimension of mlp in attention.
    aconv_chans (int): Number of attention conv filter channels.
    aconv_filts (int): Number of attention conv filter size.
    cumulate_att_w (bool): Whether to cumulate previous attention weight.
    use_batch_norm (bool): Whether to use batch normalization.
    use_concate (bool): Whether to concat enc outputs w/ dec lstm outputs.
    reduction_factor (int): Reduction factor.
    spk_num (Optional[int]): Number of speakers. If set to > 1, assume that the
        sids will be provided as the input and use sid embedding layer.
    lang_num (Optional[int]): Number of languages. If set to > 1, assume that the
        lids will be provided as the input and use lid embedding layer.
    spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0,
        assume that spk_emb will be provided as the input.
    spk_embed_integration_type (str): How to integrate speaker embedding.
    dropout_rate (float): Dropout rate.
    zoneout_rate (float): Zoneout rate.
"""
assert check_argument_types()
super().__init__()
...
...
@@ -258,31 +228,19 @@ class Tacotron2(nn.Layer):
) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
"""Calculate forward propagation.
Args:
    text (Tensor(int64)): Batch of padded character ids (B, T_text).
    text_lengths (Tensor(int64)): Batch of lengths of each input batch (B,).
    speech (Tensor): Batch of padded target features (B, T_feats, odim).
    speech_lengths (Tensor(int64)): Batch of the lengths of each target (B,).
    spk_emb (Optional[Tensor]): Batch of speaker embeddings (B, spk_embed_dim).
    spk_id (Optional[Tensor]): Batch of speaker IDs (B, 1).
    lang_id (Optional[Tensor]): Batch of language IDs (B, 1).
Returns:
    Tensor: Loss scalar value.
    Dict: Statistics to be monitored.
    Tensor: Weight value if not joint training else model outputs.
"""
text = text[:, :text_lengths.max()]
...
...
@@ -369,35 +327,21 @@ class Tacotron2(nn.Layer):
use_teacher_forcing: bool=False,
) -> Dict[str, paddle.Tensor]:
"""Generate the sequence of features given the sequences of characters.
Args:
    text (Tensor(int64)): Input sequence of characters (T_text,).
    speech (Optional[Tensor]): Feature sequence to extract style (N, idim).
    spk_emb (Optional[Tensor]): Speaker embedding (spk_embed_dim,).
    spk_id (Optional[Tensor]): Speaker ID (1,).
    lang_id (Optional[Tensor]): Language ID (1,).
    threshold (float): Threshold in inference.
    minlenratio (float): Minimum length ratio in inference.
    maxlenratio (float): Maximum length ratio in inference.
    use_att_constraint (bool): Whether to apply attention constraint.
    backward_window (int): Backward window in attention constraint.
    forward_window (int): Forward window in attention constraint.
    use_teacher_forcing (bool): Whether to use teacher forcing.
Returns:
Dict[str, Tensor]
Output dict including the following items:
* feat_gen (Tensor): Output sequence of features (T_feats, odim).
...
...
@@ -458,17 +402,12 @@ class Tacotron2(nn.Layer):
spk_emb: paddle.Tensor) -> paddle.Tensor:
"""Integrate speaker embedding with hidden states.
Args:
    hs (Tensor): Batch of hidden state sequences (B, Tmax, eunits).
    spk_emb (Tensor): Batch of speaker embeddings (B, spk_embed_dim).
Returns:
    Tensor: Batch of integrated hidden state sequences (B, Tmax, eunits) if
        integration_type is "add" else (B, Tmax, eunits + spk_embed_dim).
"""
...
...
paddlespeech/t2s/models/transformer_tts/transformer_tts.py
...
...
@@ -48,126 +48,66 @@ class TransformerTTS(nn.Layer):
.. _`Neural Speech Synthesis with Transformer Network`:
https://arxiv.org/pdf/1809.08895.pdf
Args:
    idim (int): Dimension of the inputs.
    odim (int): Dimension of the outputs.
    embed_dim (int, optional): Dimension of character embedding.
    eprenet_conv_layers (int, optional): Number of encoder prenet convolution layers.
    eprenet_conv_chans (int, optional): Number of encoder prenet convolution channels.
    eprenet_conv_filts (int, optional): Filter size of encoder prenet convolution.
    dprenet_layers (int, optional): Number of decoder prenet layers.
    dprenet_units (int, optional): Number of decoder prenet hidden units.
    elayers (int, optional): Number of encoder layers.
    eunits (int, optional): Number of encoder hidden units.
    adim (int, optional): Number of attention transformation dimensions.
    aheads (int, optional): Number of heads for multi head attention.
    dlayers (int, optional): Number of decoder layers.
    dunits (int, optional): Number of decoder hidden units.
    postnet_layers (int, optional): Number of postnet layers.
    postnet_chans (int, optional): Number of postnet channels.
    postnet_filts (int, optional): Filter size of postnet.
    use_scaled_pos_enc (bool, optional): Whether to use trainable scaled positional encoding.
    use_batch_norm (bool, optional): Whether to use batch normalization in encoder prenet.
    encoder_normalize_before (bool, optional): Whether to perform layer normalization before encoder block.
    decoder_normalize_before (bool, optional): Whether to perform layer normalization before decoder block.
    encoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in encoder.
    decoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in decoder.
    positionwise_layer_type (str, optional): Position-wise operation type.
    positionwise_conv_kernel_size (int, optional): Kernel size in position wise conv 1d.
    reduction_factor (int, optional): Reduction factor.
    spk_embed_dim (int, optional): Number of speaker embedding dimensions.
    spk_embed_integration_type (str, optional): How to integrate speaker embedding.
    use_gst (bool, optional): Whether to use global style token.
    gst_tokens (int, optional): The number of GST embeddings.
    gst_heads (int, optional): The number of heads in GST multihead attention.
    gst_conv_layers (int, optional): The number of conv layers in GST.
    gst_conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in GST.
    gst_conv_kernel_size (int, optional): Kernel size of conv layers in GST.
    gst_conv_stride (int, optional): Stride size of conv layers in GST.
    gst_gru_layers (int, optional): The number of GRU layers in GST.
    gst_gru_units (int, optional): The number of GRU units in GST.
    transformer_lr (float, optional): Initial value of learning rate.
    transformer_warmup_steps (int, optional): Optimizer warmup steps.
    transformer_enc_dropout_rate (float, optional): Dropout rate in encoder except attention and positional encoding.
    transformer_enc_positional_dropout_rate (float, optional): Dropout rate after encoder positional encoding.
    transformer_enc_attn_dropout_rate (float, optional): Dropout rate in encoder self-attention module.
    transformer_dec_dropout_rate (float, optional): Dropout rate in decoder except attention & positional encoding.
    transformer_dec_positional_dropout_rate (float, optional): Dropout rate after decoder positional encoding.
    transformer_dec_attn_dropout_rate (float, optional): Dropout rate in decoder self-attention module.
    transformer_enc_dec_attn_dropout_rate (float, optional): Dropout rate in encoder-decoder attention module.
    init_type (str, optional): How to initialize transformer parameters.
    init_enc_alpha (float, optional): Initial value of alpha in scaled pos encoding of the encoder.
    init_dec_alpha (float, optional): Initial value of alpha in scaled pos encoding of the decoder.
    eprenet_dropout_rate (float, optional): Dropout rate in encoder prenet.
    dprenet_dropout_rate (float, optional): Dropout rate in decoder prenet.
    postnet_dropout_rate (float, optional): Dropout rate in postnet.
    use_masking (bool, optional): Whether to apply masking for padded part in loss calculation.
    use_weighted_masking (bool, optional): Whether to apply weighted masking in loss calculation.
    bce_pos_weight (float, optional): Positive sample weight in bce calculation (only for use_masking=true).
    loss_type (str, optional): How to calculate loss.
    use_guided_attn_loss (bool, optional): Whether to use guided attention loss.
    num_heads_applied_guided_attn (int, optional): Number of heads in each layer to apply guided attention loss.
    num_layers_applied_guided_attn (int, optional): Number of layers to apply guided attention loss.
        List of module names to apply guided attention loss.
"""
...
...
@@ -398,25 +338,16 @@ class TransformerTTS(nn.Layer):
) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
"""Calculate forward propagation.
Args:
    text (Tensor(int64)): Batch of padded character ids (B, Tmax).
    text_lengths (Tensor(int64)): Batch of lengths of each input batch (B,).
    speech (Tensor): Batch of padded target features (B, Lmax, odim).
    speech_lengths (Tensor(int64)): Batch of the lengths of each target (B,).
    spk_emb (Tensor, optional): Batch of speaker embeddings (B, spk_embed_dim).
Returns:
    Tensor: Loss scalar value.
    Dict: Statistics to be monitored.
"""
# input of embedding must be int64
...
...
@@ -525,31 +456,19 @@ class TransformerTTS(nn.Layer):
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Generate the sequence of features given the sequences of characters.
Args:
    text (Tensor(int64)): Input sequence of characters (T,).
    speech (Tensor, optional): Feature sequence to extract style (N, idim).
    spk_emb (Tensor, optional): Speaker embedding vector (spk_embed_dim,).
    threshold (float, optional): Threshold in inference.
    minlenratio (float, optional): Minimum length ratio in inference.
    maxlenratio (float, optional): Maximum length ratio in inference.
    use_teacher_forcing (bool, optional): Whether to use teacher forcing.
Returns:
    Tensor: Output sequence of features (L, odim).
    Tensor: Output sequence of stop probabilities (L,).
    Tensor: Encoder-decoder (source) attention weights (#layers, #heads, L, T).
"""
# input of embedding must be int64
...
...
@@ -671,19 +590,13 @@ class TransformerTTS(nn.Layer):
def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor:
"""Make masks for self-attention.
Args:
    ilens (Tensor): Batch of lengths (B,).
Returns:
    Tensor: Mask tensor for self-attention. dtype=paddle.bool
Examples:
>>> ilens = [5, 3]
>>> self._source_mask(ilens)
tensor([[[1, 1, 1, 1, 1],
...
...
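The rule behind the example above is simply "valid up to each sequence's length". A minimal pure-Python sketch of the same truth table (the actual method returns a paddle bool tensor):

```python
def source_mask(ilens):
    # position t is valid for a sequence iff t < its length
    maxlen = max(ilens)
    return [[t < length for t in range(maxlen)] for length in ilens]

mask = source_mask([5, 3])
# mask[1] -> [True, True, True, False, False]
```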
@@ -696,18 +609,13 @@ class TransformerTTS(nn.Layer):
def _target_mask(self, olens: paddle.Tensor) -> paddle.Tensor:
"""Make masks for masked self-attention.
Args:
    olens (Tensor(int64)): Batch of lengths (B,).
Returns:
    Tensor: Mask tensor for masked self-attention.
Examples:
>>> olens = [5, 3]
>>> self._target_mask(olens)
tensor([[[1, 0, 0, 0, 0],
...
...
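The decoder mask combines a padding constraint with a no-look-ahead (lower-triangular) constraint, which is what the example illustrates. A pure-Python sketch of that rule, assuming shape (B, maxlen, maxlen):

```python
def target_mask(olens):
    # query position i may attend key position t iff t <= i (causal)
    # and t is not padding (t < length)
    maxlen = max(olens)
    return [[[t <= i and t < length for t in range(maxlen)]
             for i in range(maxlen)]
            for length in olens]

m = target_mask([5, 3])
# m[0][0] -> [True, False, False, False, False]
```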
@@ -731,17 +639,12 @@ class TransformerTTS(nn.Layer):
spk_emb: paddle.Tensor) -> paddle.Tensor:
"""Integrate speaker embedding with hidden states.
Args:
    hs (Tensor): Batch of hidden state sequences (B, Tmax, adim).
    spk_emb (Tensor): Batch of speaker embeddings (B, spk_embed_dim).
Returns:
    Tensor: Batch of integrated hidden state sequences (B, Tmax, adim).
"""
if self.spk_embed_integration_type == "add":
...
...
paddlespeech/t2s/models/waveflow.py
(This diff is collapsed.)
paddlespeech/t2s/models/wavernn/wavernn.py
...
...
@@ -67,14 +67,10 @@ class MelResNet(nn.Layer):
def forward(self, x):
'''
Args:
    x (Tensor): Input tensor (B, in_dims, T).
Returns:
    Tensor: Output tensor (B, res_out_dims, T).
'''
x = self.conv_in(x)
...
...
@@ -121,16 +117,11 @@ class UpsampleNetwork(nn.Layer):
def forward(self, m):
'''
Args:
    c (Tensor): Input tensor (B, C_aux, T).
Returns:
    Tensor: Output tensor (B, (T - 2 * pad) * prod(upsample_scales), C_aux).
    Tensor: Output tensor (B, (T - 2 * pad) * prod(upsample_scales), res_out_dims).
'''
# aux: [B, C_aux, T]
# -> [B, res_out_dims, T - 2 * aux_context_window]
...
...
@@ -172,32 +163,20 @@ class WaveRNN(nn.Layer):
mode='RAW',
init_type: str="xavier_uniform", ):
'''
Args:
    rnn_dims (int, optional): Hidden dims of RNN Layers.
    fc_dims (int, optional): Dims of FC Layers.
    bits (int, optional): bit depth of signal.
    aux_context_window (int, optional): The context window size of the first convolution applied to the
        auxiliary input, by default 2
    upsample_scales (List[int], optional): Upsample scales of the upsample network.
    aux_channels (int, optional): Auxiliary channel of the residual blocks.
    compute_dims (int, optional): Dims of Conv1D in MelResNet.
    res_out_dims (int, optional): Dims of output in MelResNet.
    res_blocks (int, optional): Number of residual blocks.
    mode (str, optional): Output mode of the WaveRNN vocoder.
        `MOL` for Mixture of Logistic Distribution, and `RAW` for quantized bits as the model's output.
    init_type (str): How to initialize parameters.
'''
super().__init__()
self.mode = mode
...
...
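In `RAW` mode the target waveform is typically mu-law companded before being quantized into `2**bits` classes. A self-contained sketch of that companding using the standard mu-law formula (function names are illustrative, not PaddleSpeech's API):

```python
import math

def mu_law_encode(x, bits=9):
    # x in [-1, 1] -> integer class in [0, 2**bits - 1]
    mu = 2 ** bits - 1
    y = math.copysign(math.log1p(mu * abs(x)) / math.log1p(mu), x)
    return int(round((y + 1) / 2 * mu))

def mu_law_decode(q, bits=9):
    # inverse companding: class index back to a sample in [-1, 1]
    mu = 2 ** bits - 1
    y = 2 * q / mu - 1
    return math.copysign(math.expm1(abs(y) * math.log1p(mu)) / mu, y)
```

Round-tripping loses only quantization noise concentrated near zero amplitude, which is why fairly low bit depths remain perceptually acceptable.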
@@ -245,18 +224,13 @@ class WaveRNN(nn.Layer):
def forward(self, x, c):
'''
Args:
    x (Tensor): wav sequence, [B, T]
    c (Tensor): mel spectrogram [B, C_aux, T']
        T = (T' - 2 * aux_context_window ) * hop_length
Returns:
    Tensor: [B, T, n_classes]
'''
# Although we `_flatten_parameters()` on init, when using DataParallel
# the model gets replicated, making it no longer guaranteed that the
...
...
@@ -304,22 +278,14 @@ class WaveRNN(nn.Layer):
mu_law: bool=True,
gen_display: bool=False):
"""
Args:
    c (Tensor): input mels, (T', C_aux)
    batched (bool): generate in batch or not
    target (int): target number of samples to be generated in each batch entry
    overlap (int): number of samples for crossfading between batches
    mu_law (bool): use mu law or not
Returns:
    wav sequence: Output (T' * prod(upsample_scales), out_channels, C_out).
"""
self.eval()
...
...
@@ -434,15 +400,12 @@ class WaveRNN(nn.Layer):
def pad_tensor(self, x, pad, side='both'):
'''
Args:
    x (Tensor): mel, [1, n_frames, 80]
    pad (int): number of frames to pad
    side (str, optional): 'both', 'before' or 'after' (Default value = 'both')
Returns:
    Tensor
'''
b, t, _ = paddle.shape(x)
...
...
@@ -461,33 +424,24 @@ class WaveRNN(nn.Layer):
Fold the tensor with overlap for quick batched inference.
Overlap will be used for crossfading in xfade_and_unfold()
Args:
    x (Tensor): Upsampled conditioning features. mels or aux
        shape=(1, T, features)
        mels: [1, T, 80]
        aux: [1, T, 128]
    target (int): Target timesteps for each index of batch
    overlap (int): Timesteps for both xfade and rnn warmup
        overlap = hop_length * 2
Returns:
    Tensor:
        shape=(num_folds, target + 2 * overlap, features)
        num_folds = (time_seq - overlap) // (target + overlap)
        mel: [num_folds, target + 2 * overlap, 80]
        aux: [num_folds, target + 2 * overlap, 128]
Details:
x = [[h1, h2, ... hn]]
Where each h is a vector of conditioning features
Eg: target=2, overlap=1 with x.size(1)=10
folded = [[h1, h2, h3, h4],
...
...
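The folding arithmetic in the docstring can be checked with a list-based sketch (the real code operates on padded paddle tensors, but the indexing is the same):

```python
def fold_with_overlap(seq, target, overlap):
    # consecutive windows of target + 2*overlap samples, advancing by
    # target + overlap, so adjacent folds share `overlap` samples
    num_folds = (len(seq) - overlap) // (target + overlap)
    step = target + overlap
    return [seq[k * step: k * step + target + 2 * overlap]
            for k in range(num_folds)]

folds = fold_with_overlap(list(range(1, 11)), target=2, overlap=1)
# -> [[1, 2, 3, 4], [4, 5, 6, 7], [7, 8, 9, 10]]
```

This reproduces the h1..h10 example in the docstring: each fold's last sample is the next fold's first.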
@@ -520,24 +474,20 @@ class WaveRNN(nn.Layer):
def xfade_and_unfold(self, y, target: int=12000, overlap: int=600):
''' Applies a crossfade and unfolds into a 1d array.
Args:
    y (Tensor): Batched sequences of audio samples
        shape=(num_folds, target + 2 * overlap)
        dtype=paddle.float32
    overlap (int): Timesteps for both xfade and rnn warmup
Returns:
    Tensor: audio samples in a 1d array
        shape=(total_len)
        dtype=paddle.float32
Details:
y = [[seq1],
[seq2],
[seq3]]
...
...
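A simplified linear-crossfade version of the unfold step, with weights chosen so that each seam's fade-out and fade-in sum to 1 (the library's actual fade curve and rnn-warmup handling differ; this is only a sketch of the idea):

```python
def xfade_and_unfold(folds, overlap):
    target = len(folds[0]) - 2 * overlap
    total = len(folds) * (target + overlap) + overlap
    out = [0.0] * total
    for k, fold in enumerate(folds):
        start = k * (target + overlap)
        for j, sample in enumerate(fold):
            w = 1.0
            if k > 0 and j < overlap:
                # fade in over the region shared with the previous fold
                w = (j + 1) / (overlap + 1)
            elif k < len(folds) - 1 and j >= target + overlap:
                # fade out over the region shared with the next fold
                w = (len(fold) - j) / (overlap + 1)
            out[start + j] += w * sample
    return out
```

On a constant signal the crossfade is transparent: every output sample receives total weight 1.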
paddlespeech/t2s/modules/causal_conv.py
...
...
@@ -41,14 +41,10 @@ class CausalConv1D(nn.Layer):
def forward(self, x):
"""Calculate forward propagation.
Args:
    x (Tensor): Input tensor (B, in_channels, T).
Returns:
    Tensor: Output tensor (B, out_channels, T).
"""
return self.conv(self.pad(x))[:, :, :x.shape[2]]
...
...
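The pad-then-trim in `forward` is what makes the convolution causal: padding on the left only means output position t never sees inputs after t. A scalar sketch for dilation 1 (pure Python, not the paddle layer):

```python
def causal_conv1d(x, kernel):
    k = len(kernel)
    # left-pad by k - 1 zeros so output t covers x[t - k + 1 .. t]
    padded = [0.0] * (k - 1) + list(x)
    return [sum(kernel[i] * padded[t + i] for i in range(k))
            for t in range(len(x))]

y = causal_conv1d([1, 2, 3, 4], [1, 1, 1])
# y == [1, 3, 6, 9]; y[0] depends only on x[0]
```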
@@ -70,13 +66,9 @@ class CausalConv1DTranspose(nn.Layer):
def forward(self, x):
"""Calculate forward propagation.
Args:
    x (Tensor): Input tensor (B, in_channels, T_in).
Returns:
    Tensor: Output tensor (B, out_channels, T_out).
"""
return self.deconv(x)[:, :, :-self.stride]
paddlespeech/t2s/modules/conformer/convolution.py
...
...
@@ -18,12 +18,10 @@ from paddle import nn
class ConvolutionModule(nn.Layer):
"""ConvolutionModule in Conformer model.
Args:
    channels (int): The number of channels of conv layers.
    kernel_size (int): Kernel size of conv layers.
"""
def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True):
...
...
@@ -59,14 +57,11 @@ class ConvolutionModule(nn.Layer):
def forward(self, x):
"""Compute convolution module.
Args:
    x (Tensor): Input tensor (#batch, time, channels).
Returns:
    Tensor: Output tensor (#batch, time, channels).
"""
# exchange the temporal dimension and the feature dimension
x = x.transpose([0, 2, 1])
...
...
paddlespeech/t2s/modules/conformer/encoder_layer.py
...
...
@@ -21,36 +21,27 @@ from paddlespeech.t2s.modules.layer_norm import LayerNorm
class EncoderLayer(nn.Layer):
"""Encoder layer module.
Args:
    size (int): Input dimension.
    self_attn (nn.Layer): Self-attention module instance.
        `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
        can be used as the argument.
    feed_forward (nn.Layer): Feed-forward module instance.
        `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
        can be used as the argument.
    feed_forward_macaron (nn.Layer): Additional feed-forward module instance.
        `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
        can be used as the argument.
    conv_module (nn.Layer): Convolution module instance.
        `ConvolutionModule` instance can be used as the argument.
    dropout_rate (float): Dropout rate.
    normalize_before (bool): Whether to use layer_norm before the first block.
    concat_after (bool): Whether to concat attention layer's input and output.
        if True, additional linear will be applied.
        i.e. x -> x + linear(concat(x, att(x)))
        if False, no additional linear will be applied. i.e. x -> x + att(x)
    stochastic_depth_rate (float): Probability to skip this layer.
        During training, the layer may skip residual computation and return input
        as-is with given probability.
"""
...
...
@@ -93,22 +84,17 @@ class EncoderLayer(nn.Layer):
    def forward(self, x_input, mask, cache=None):
        """Compute encoded features.

        Args:
            x_input (Union[Tuple, Tensor]): Input tensor w/ or w/o pos emb.
                - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
                - w/o pos emb: Tensor (#batch, time, size).
            mask (Tensor): Mask tensor for the input (#batch, time).
            cache (Tensor): Cache tensor of the input (#batch, time - 1, size).

        Returns:
            Tensor: Output tensor (#batch, time, size).
            Tensor: Mask tensor (#batch, time).
        """
        if isinstance(x_input, tuple):
            x, pos_emb = x_input[0], x_input[1]
...
...
paddlespeech/t2s/modules/conv.py
...
...
@@ -41,24 +41,17 @@ class Conv1dCell(nn.Conv1D):
    Thus, these arguments are removed from the ``__init__`` method of this
    class.

    Args:
        in_channels (int): The feature size of the input.
        out_channels (int): The feature size of the output.
        kernel_size (int or Tuple[int]): The size of the kernel.
        dilation (int or Tuple[int]): The dilation of the convolution, by default 1.
        weight_attr (ParamAttr, Initializer, str or bool, optional): The parameter
            attribute of the convolution kernel, by default None.
        bias_attr (ParamAttr, Initializer, str or bool, optional): The parameter
            attribute of the bias. If ``False``, this layer does not have a bias,
            by default None.

    Examples:
        >>> cell = Conv1dCell(3, 4, kernel_size=5)
        >>> inputs = [paddle.randn([4, 3]) for _ in range(16)]
        >>> outputs = []
...
...
@@ -103,13 +96,11 @@ class Conv1dCell(nn.Conv1D):
    def start_sequence(self):
        """Prepare the layer for a series of incremental forward.

        Warnings:
            This method should be called before a sequence of calls to
            ``add_input``.

        Raises:
            Exception: If this method is called when the layer is in training mode.
        """
...
...
@@ -130,10 +121,9 @@ class Conv1dCell(nn.Conv1D):
    def initialize_buffer(self, x_t):
        """Initialize the buffer for the step input.

        Args:
            x_t (Tensor): The step input. shape=(batch_size, in_channels)
        """
        batch_size, _ = x_t.shape
        self._buffer = paddle.zeros(
...
...
@@ -143,10 +133,9 @@ class Conv1dCell(nn.Conv1D):
    def update_buffer(self, x_t):
        """Shift the buffer by one step.

        Args:
            x_t (Tensor): The step input. shape=(batch_size, in_channels)
        """
        self._buffer = paddle.concat(
            [self._buffer[:, :, 1:], paddle.unsqueeze(x_t, -1)], -1)
...
...
@@ -154,15 +143,12 @@ class Conv1dCell(nn.Conv1D):
    def add_input(self, x_t):
        """Add step input and compute step output.

        Args:
            x_t (Tensor): The step input. shape=(batch_size, in_channels)

        Returns:
            y_t (Tensor): The step output. shape=(batch_size, out_channels)
        """
        batch_size = x_t.shape[0]
        if self.receptive_field > 1:
...
...
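The incremental interface above (initialize a zero buffer, shift it one step per input, apply the kernel) can be mimicked in plain numpy. The sketch below is hypothetical and not the paddle implementation; `Conv1dCellSketch` and its einsum contraction are illustrative names only.

```python
import numpy as np

class Conv1dCellSketch:
    """Minimal numpy sketch of an incremental causal Conv1D cell.

    The buffer holds the last `receptive_field` step inputs; each
    `add_input` shifts the buffer, appends the new step, and applies
    the kernel, mirroring initialize_buffer/update_buffer/add_input.
    """

    def __init__(self, kernel):
        # kernel: (out_channels, in_channels, kernel_size)
        self.kernel = kernel
        self.receptive_field = kernel.shape[-1]
        self._buffer = None

    def add_input(self, x_t):
        # x_t: (batch_size, in_channels)
        batch_size, in_channels = x_t.shape
        if self._buffer is None:
            # initialize_buffer: zeros for the warm-up steps
            self._buffer = np.zeros(
                (batch_size, in_channels, self.receptive_field))
        # update_buffer: drop the oldest step, append the newest
        self._buffer = np.concatenate(
            [self._buffer[:, :, 1:], x_t[:, :, None]], axis=-1)
        # convolution at a single time step is a tensor contraction
        # over (in_channels, kernel_size)
        return np.einsum("bck,ock->bo", self._buffer, self.kernel)

kernel = np.random.randn(4, 3, 5)
cell = Conv1dCellSketch(kernel)
outputs = [cell.add_input(np.random.randn(2, 3)) for _ in range(16)]
```

Each step output equals a full causal convolution evaluated at that time step, which is what makes the cell useful for autoregressive vocoder inference.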
@@ -186,33 +172,26 @@ class Conv1dCell(nn.Conv1D):
class Conv1dBatchNorm(nn.Layer):
    """A Conv1D layer followed by a BatchNorm1D.

    Args:
        in_channels (int): The feature size of the input.
        out_channels (int): The feature size of the output.
        kernel_size (int): The size of the convolution kernel.
        stride (int, optional): The stride of the convolution, by default 1.
        padding (int, str or Tuple[int], optional): The padding of the convolution.
            If int, a symmetrical padding is applied before convolution;
            if str, it should be "same" or "valid";
            if Tuple[int], its length should be 2, meaning
            ``(pad_before, pad_after)``, by default 0.
        weight_attr (ParamAttr, Initializer, str or bool, optional): The parameter
            attribute of the convolution kernel, by default None.
        bias_attr (ParamAttr, Initializer, str or bool, optional): The parameter
            attribute of the bias of the convolution, by default None.
        data_format (str ["NCL" or "NLC"], optional): The data layout of the input,
            by default "NCL".
        momentum (float, optional): The momentum of the BatchNorm1D layer, by default 0.9.
        epsilon (float, optional): The epsilon of the BatchNorm1D layer, by default 1e-05.
    """
    def __init__(self,
def
__init__
(
self
,
...
...
@@ -245,15 +224,14 @@ class Conv1dBatchNorm(nn.Layer):
    def forward(self, x):
        """Forward pass of the Conv1dBatchNorm layer.

        Args:
            x (Tensor): The input tensor. Its data layout depends on ``data_format``.
                shape=(B, C_in, T_in) or (B, T_in, C_in)

        Returns:
            Tensor: The output tensor. shape=(B, C_out, T_out) or (B, T_out, C_out)
        """
        x = self.conv(x)
        x = self.bn(x)
...
...
paddlespeech/t2s/modules/geometry.py
...
...
@@ -18,23 +18,17 @@ import paddle
def shuffle_dim(x, axis, perm=None):
    """Permute the input tensor along the given axis, either by a given permutation or randomly.

    Args:
        x (Tensor): The input tensor.
        axis (int): The axis to shuffle.
        perm (List[int], ndarray, optional):
            The order to reorder the tensor along the ``axis``-th dimension.
            It is a permutation of ``[0, d)``, where d is the size of the
            ``axis``-th dimension of the input tensor. If not provided,
            a random permutation is used. Defaults to None.

    Returns:
        Tensor: The shuffled tensor, which has the same shape as x does.
    """
    size = x.shape[axis]
    if perm is not None and len(perm) != size:
...
...
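The behavior documented for `shuffle_dim` reduces to an indexed gather along one axis. A minimal numpy sketch (illustrative only, `shuffle_dim_np` is a hypothetical name):

```python
import numpy as np

def shuffle_dim_np(x, axis, perm=None):
    """Permute `x` along `axis`; draw a random permutation when
    `perm` is not given, mirroring shuffle_dim above."""
    size = x.shape[axis]
    if perm is None:
        perm = np.random.permutation(size)
    assert len(perm) == size, "perm must be a permutation of [0, size)"
    # gather along the chosen axis; output shape equals input shape
    return np.take(x, perm, axis=axis)

x = np.arange(12).reshape(3, 4)
y = shuffle_dim_np(x, axis=1, perm=[3, 2, 1, 0])  # reverses the columns
```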
paddlespeech/t2s/modules/layer_norm.py
...
...
@@ -18,13 +18,9 @@ from paddle import nn
class LayerNorm(nn.LayerNorm):
    """Layer normalization module.

    Args:
        nout (int): Output dim size.
        dim (int): Dimension to be normalized.
    """
    def __init__(self, nout, dim=-1):
...
...
@@ -35,15 +31,11 @@ class LayerNorm(nn.LayerNorm):
    def forward(self, x):
        """Apply layer normalization.

        Args:
            x (Tensor): Input tensor.

        Returns:
            Tensor: Normalized tensor.
        """
        if self.dim == -1:
...
...
paddlespeech/t2s/modules/losses.py
...
...
@@ -118,16 +118,13 @@ def discretized_mix_logistic_loss(y_hat,
def sample_from_discretized_mix_logistic(y, log_scale_min=None):
    """Sample from a discretized mixture of logistic distributions.

    Args:
        y (Tensor): (B, C, T)
        log_scale_min (float, optional): Log scale minimum value. (Default value = None)

    Returns:
        Tensor: sample in range of [-1, 1].
    """
    if log_scale_min is None:
        log_scale_min = float(np.log(1e-14))
...
...
@@ -181,14 +178,10 @@ class GuidedAttentionLoss(nn.Layer):
    def __init__(self, sigma=0.4, alpha=1.0, reset_always=True):
        """Initialize guided attention loss module.

        Args:
            sigma (float, optional): Standard deviation to control how close attention is to a diagonal.
            alpha (float, optional): Scaling coefficient (lambda).
            reset_always (bool, optional): Whether to always reset masks.
        """
        super().__init__()
...
...
@@ -205,19 +198,13 @@ class GuidedAttentionLoss(nn.Layer):
    def forward(self, att_ws, ilens, olens):
        """Calculate forward propagation.

        Args:
            att_ws (Tensor): Batch of attention weights (B, T_max_out, T_max_in).
            ilens (Tensor(int64)): Batch of input lengths (B,).
            olens (Tensor(int64)): Batch of output lengths (B,).

        Returns:
            Tensor: Guided attention loss value.
        """
        if self.guided_attn_masks is None:
...
...
@@ -282,20 +269,14 @@ class GuidedAttentionLoss(nn.Layer):
    def _make_masks(ilens, olens):
        """Make masks indicating non-padded part.

        Args:
            ilens (Tensor(int64) or List): Batch of lengths (B,).
            olens (Tensor(int64) or List): Batch of lengths (B,).

        Returns:
            Tensor: Mask tensor indicating non-padded part.

        Examples:
            >>> ilens, olens = [5, 2], [8, 5]
            >>> _make_mask(ilens, olens)
            tensor([[[1, 1, 1, 1, 1],
...
...
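The soft diagonal penalty behind `GuidedAttentionLoss` has a closed form that is easy to sketch. Below is a hypothetical numpy version (the function name and exact normalization are illustrative, following the usual guided-attention formulation): the weight is near 0 on the input/output diagonal and approaches 1 away from it.

```python
import numpy as np

def guided_attention_mask(ilen, olen, sigma=0.4):
    """Soft diagonal penalty for one utterance:
    W[t_out, t_in] = 1 - exp(-(t_in/ilen - t_out/olen)^2 / (2 * sigma^2)).
    Multiplying this with attention weights penalizes off-diagonal mass."""
    grid_out, grid_in = np.meshgrid(
        np.arange(olen), np.arange(ilen), indexing="ij")
    return 1.0 - np.exp(-(grid_in / ilen - grid_out / olen) ** 2
                        / (2 * sigma ** 2))

w = guided_attention_mask(5, 5)  # zero on the diagonal when ilen == olen
```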
@@ -330,34 +311,24 @@ class GuidedAttentionLoss(nn.Layer):
class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss):
    """Guided attention loss function module for multi head attention.

    Args:
        sigma (float, optional): Standard deviation to control
            how close attention is to a diagonal.
        alpha (float, optional): Scaling coefficient (lambda).
        reset_always (bool, optional): Whether to always reset masks.
    """
    def forward(self, att_ws, ilens, olens):
        """Calculate forward propagation.

        Args:
            att_ws (Tensor): Batch of multi head attention weights (B, H, T_max_out, T_max_in).
            ilens (Tensor): Batch of input lengths (B,).
            olens (Tensor): Batch of output lengths (B,).

        Returns:
            Tensor: Guided attention loss value.
        """
        if self.guided_attn_masks is None:
...
...
@@ -382,14 +353,11 @@ class Tacotron2Loss(nn.Layer):
                 use_weighted_masking=False,
                 bce_pos_weight=20.0):
        """Initialize Tacotron2 loss module.

        Args:
            use_masking (bool): Whether to apply masking for padded part in loss calculation.
            use_weighted_masking (bool): Whether to apply weighted masking in loss calculation.
            bce_pos_weight (float): Weight of positive sample of stop token.
        """
        super().__init__()
        assert (use_masking != use_weighted_masking) or not use_masking
...
...
@@ -405,28 +373,19 @@ class Tacotron2Loss(nn.Layer):
    def forward(self, after_outs, before_outs, logits, ys, stop_labels, olens):
        """Calculate forward propagation.

        Args:
            after_outs (Tensor): Batch of outputs after postnets (B, Lmax, odim).
            before_outs (Tensor): Batch of outputs before postnets (B, Lmax, odim).
            logits (Tensor): Batch of stop logits (B, Lmax).
            ys (Tensor): Batch of padded target features (B, Lmax, odim).
            stop_labels (Tensor(int64)): Batch of the sequences of stop token labels (B, Lmax).
            olens (Tensor(int64)): Batch of the lengths of each target (B,).

        Returns:
            Tensor: L1 loss value.
            Tensor: Mean square error loss value.
            Tensor: Binary cross entropy loss value.
        """
        # make mask and apply it
        if self.use_masking:
...
...
@@ -513,28 +472,20 @@ def stft(x,
         center=True,
         pad_mode='reflect'):
    """Perform STFT and convert to magnitude spectrogram.

    Args:
        x (Tensor): Input signal tensor (B, T).
        fft_size (int): FFT size.
        hop_size (int): Hop size.
        win_length (int, optional): Window length. (Default value = None)
        window (str, optional): Name of window function, see `scipy.signal.get_window` for more
            details. Defaults to "hann".
        center (bool, optional): Whether to pad `x` so that the
            :math:`t \times hop\_length`-th sample lies at the center of the
            :math:`t`-th frame. Default: `True`.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`.
            (Default value = 'reflect')

    Returns:
        Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
    """
    # calculate window
    window = signal.get_window(window, win_length, fftbins=True)
...
...
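The framing-plus-FFT pipeline that `stft` documents can be sketched with numpy and `scipy.signal.get_window`. This is a simplified, hypothetical version (no centering/reflect padding, single signal instead of a batch), not the paddle implementation:

```python
import numpy as np
from scipy import signal

def stft_magnitude(x, fft_size=1024, hop_size=256, win_length=None,
                   window="hann"):
    """Frame the signal, apply the window, and return |rfft| per frame,
    shape (#frames, fft_size // 2 + 1)."""
    if win_length is None:
        win_length = fft_size
    win = signal.get_window(window, win_length, fftbins=True)
    n_frames = 1 + (len(x) - win_length) // hop_size
    frames = np.stack([x[i * hop_size:i * hop_size + win_length]
                       for i in range(n_frames)])
    return np.abs(np.fft.rfft(frames * win, n=fft_size, axis=-1))

# a 440 Hz tone at 16 kHz concentrates energy near bin 440 * 1024 / 16000
x = np.sin(2 * np.pi * 440 * np.arange(16000) / 16000)
mag = stft_magnitude(x)
```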
@@ -564,16 +515,11 @@ class SpectralConvergenceLoss(nn.Layer):
    def forward(self, x_mag, y_mag):
        """Calculate forward propagation.

        Args:
            x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
            y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).

        Returns:
            Tensor: Spectral convergence loss value.
        """
        return paddle.norm(
            y_mag - x_mag, p="fro") / paddle.clip(
...
@@ -590,16 +536,11 @@ class LogSTFTMagnitudeLoss(nn.Layer):
    def forward(self, x_mag, y_mag):
        """Calculate forward propagation.

        Args:
            x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
            y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).

        Returns:
            Tensor: Log STFT magnitude loss value.
        """
        return F.l1_loss(
            paddle.log(paddle.clip(y_mag, min=self.epsilon)),
...
@@ -625,18 +566,12 @@ class STFTLoss(nn.Layer):
    def forward(self, x, y):
        """Calculate forward propagation.

        Args:
            x (Tensor): Predicted signal (B, T).
            y (Tensor): Groundtruth signal (B, T).

        Returns:
            Tensor: Spectral convergence loss value.
            Tensor: Log STFT magnitude loss value.
        """
        x_mag = stft(x, self.fft_size, self.shift_size, self.win_length,
                     self.window)
...
...
@@ -658,16 +593,11 @@ class MultiResolutionSTFTLoss(nn.Layer):
                 win_lengths=[600, 1200, 240],
                 window="hann", ):
        """Initialize Multi resolution STFT loss module.

        Args:
            fft_sizes (list): List of FFT sizes.
            hop_sizes (list): List of hop sizes.
            win_lengths (list): List of window lengths.
            window (str): Window function type.
        """
        super().__init__()
        assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
...
...
@@ -677,18 +607,13 @@ class MultiResolutionSTFTLoss(nn.Layer):
    def forward(self, x, y):
        """Calculate forward propagation.

        Args:
            x (Tensor): Predicted signal (B, T) or (B, #subband, T).
            y (Tensor): Groundtruth signal (B, T) or (B, #subband, T).

        Returns:
            Tensor: Multi resolution spectral convergence loss value.
            Tensor: Multi resolution log STFT magnitude loss value.
        """
        if len(x.shape) == 3:
            # (B, C, T) -> (B x C, T)
...
...
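The two STFT-loss terms and their multi-resolution average can be written out directly. A minimal numpy sketch, assuming magnitude spectrograms are already computed per resolution (function names are illustrative, not the paddle API):

```python
import numpy as np

def spectral_convergence(x_mag, y_mag):
    # ||Y - X||_F / ||Y||_F, as in SpectralConvergenceLoss
    return np.linalg.norm(y_mag - x_mag) / max(np.linalg.norm(y_mag), 1e-12)

def log_stft_magnitude(x_mag, y_mag, eps=1e-10):
    # L1 distance between log magnitudes, as in LogSTFTMagnitudeLoss
    return np.mean(np.abs(np.log(np.clip(y_mag, eps, None))
                          - np.log(np.clip(x_mag, eps, None))))

def multi_resolution_stft_loss(mags_x, mags_y):
    """Average both terms over several (x_mag, y_mag) resolution pairs,
    mirroring MultiResolutionSTFTLoss."""
    sc = np.mean([spectral_convergence(xm, ym)
                  for xm, ym in zip(mags_x, mags_y)])
    mag = np.mean([log_stft_magnitude(xm, ym)
                   for xm, ym in zip(mags_x, mags_y)])
    return sc, mag
```

Using several FFT sizes keeps the generator from overfitting any single time-frequency trade-off.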
@@ -725,14 +650,10 @@ class GeneratorAdversarialLoss(nn.Layer):
    def forward(self, outputs):
        """Calculate generator adversarial loss.

        Args:
            outputs (Tensor or List): Discriminator outputs or list of discriminator outputs.

        Returns:
            Tensor: Generator adversarial loss value.
        """
        if isinstance(outputs, (tuple, list)):
            adv_loss = 0.0
...
...
@@ -772,20 +693,15 @@ class DiscriminatorAdversarialLoss(nn.Layer):
    def forward(self, outputs_hat, outputs):
        """Calculate discriminator adversarial loss.

        Args:
            outputs_hat (Tensor or list): Discriminator outputs or list of
                discriminator outputs calculated from generator outputs.
            outputs (Tensor or list): Discriminator outputs or list of
                discriminator outputs calculated from groundtruth.

        Returns:
            Tensor: Discriminator real loss value.
            Tensor: Discriminator fake loss value.
        """
        if isinstance(outputs, (tuple, list)):
            real_loss = 0.0
...
...
@@ -868,17 +784,13 @@ def ssim(img1, img2, window_size=11, size_average=True):
def weighted_mean(input, weight):
    """Weighted mean. It can also be used as masked mean.

    Args:
        input (Tensor): The input tensor.
        weight (Tensor): The weight tensor with broadcastable shape with the input.

    Returns:
        Tensor: Weighted mean tensor with the same dtype as input. shape=(1,)
    """
    weight = paddle.cast(weight, input.dtype)
# paddle.Tensor.size is different with torch.size() and has been overrided in s2t.__init__
...
...
@@ -889,20 +801,15 @@ def weighted_mean(input, weight):
def masked_l1_loss(prediction, target, mask):
    """Compute masked L1 loss.

    Args:
        prediction (Tensor): The prediction.
        target (Tensor): The target. The shape should be broadcastable to ``prediction``.
        mask (Tensor): The mask. The shape should be broadcastable to the broadcasted shape of
            ``prediction`` and ``target``.

    Returns:
        Tensor: The masked L1 loss. shape=(1,)
    """
    abs_error = F.l1_loss(prediction, target, reduction='none')
    loss = weighted_mean(abs_error, mask)
...
...
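The `weighted_mean` / `masked_l1_loss` pair boils down to a few array operations. A minimal numpy sketch (illustrative names, not the paddle implementation):

```python
import numpy as np

def masked_l1_loss_np(prediction, target, mask):
    """Weighted mean of |prediction - target| where the mask zeroes
    out padded positions, mirroring masked_l1_loss above."""
    abs_error = np.abs(prediction - target)
    # broadcast the mask to the error's shape, then take a weighted mean
    weight = np.broadcast_to(mask, abs_error.shape).astype(abs_error.dtype)
    return (abs_error * weight).sum() / max(weight.sum(), 1.0)

pred = np.array([[1.0, 2.0, 99.0]])
targ = np.array([[1.5, 2.0, 0.0]])
mask = np.array([[1, 1, 0]])   # last position is padding
loss = masked_l1_loss_np(pred, targ, mask)   # 0.25: padding is ignored
```

Dividing by the mask sum rather than the element count keeps the loss scale independent of how much padding a batch contains.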
@@ -975,14 +882,11 @@ class MelSpectrogram(nn.Layer):
    def forward(self, x):
        """Calculate Mel-spectrogram.

        Args:
            x (Tensor): Input waveform tensor (B, T) or (B, 1, T).

        Returns:
            Tensor: Mel-spectrogram (B, #mels, #frames).
        """
        if len(x.shape) == 3:
            # (B, C, T) -> (B*C, T)
...
...
@@ -1047,16 +951,12 @@ class MelSpectrogramLoss(nn.Layer):
    def forward(self, y_hat, y):
        """Calculate Mel-spectrogram loss.

        Args:
            y_hat (Tensor): Generated signal tensor (B, 1, T).
            y (Tensor): Groundtruth signal tensor (B, 1, T).

        Returns:
            Tensor: Mel-spectrogram loss value.
        """
        mel_hat = self.mel_spectrogram(y_hat)
        mel = self.mel_spectrogram(y)
...
...
@@ -1081,18 +981,14 @@ class FeatureMatchLoss(nn.Layer):
    def forward(self, feats_hat, feats):
        """Calculate feature matching loss.

        Args:
            feats_hat (list): List of lists of discriminator outputs
                calculated from generator outputs.
            feats (list): List of lists of discriminator outputs
                calculated from groundtruth.

        Returns:
            Tensor: Feature matching loss value.
        """
        feat_match_loss = 0.0
...
...
paddlespeech/t2s/modules/nets_utils.py
...
...
@@ -20,20 +20,14 @@ from typeguard import check_argument_types
def pad_list(xs, pad_value):
    """Perform padding for the list of tensors.

    Args:
        xs (List[Tensor]): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
        pad_value (float): Value for padding.

    Returns:
        Tensor: Padded tensor (B, Tmax, `*`).

    Examples:
        >>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])]
        >>> x
        [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
...
...
@@ -55,18 +49,13 @@ def pad_list(xs, pad_value):
def make_pad_mask(lengths, length_dim=-1):
    """Make mask tensor containing indices of padded part.

    Args:
        lengths (Tensor(int64)): Batch of lengths (B,).

    Returns:
        Tensor(bool): Mask tensor containing indices of padded part.

    Examples:
        With only lengths.
        >>> lengths = [5, 3, 2]
...
...
@@ -91,24 +80,17 @@ def make_pad_mask(lengths, length_dim=-1):
def make_non_pad_mask(lengths, length_dim=-1):
    """Make mask tensor containing indices of non-padded part.

    Args:
        lengths (Tensor(int64) or List): Batch of lengths (B,).
        xs (Tensor, optional): The reference tensor.
            If set, masks will be the same shape as this tensor.
        length_dim (int, optional): Dimension indicator of the above tensor.
            See the example.

    Returns:
        Tensor(bool): Mask tensor containing indices of non-padded part.

    Examples:
        With only lengths.
        >>> lengths = [5, 3, 2]
...
...
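The lengths-to-mask logic that `make_pad_mask` and `make_non_pad_mask` document is one broadcasted comparison. A minimal numpy sketch (hypothetical helper name, lengths-only case):

```python
import numpy as np

def make_pad_mask_np(lengths):
    """True marks padded positions; make_non_pad_mask is its negation.
    lengths: iterable of ints (B,) -> bool array (B, max(lengths))."""
    lengths = np.asarray(lengths)
    # position index >= sequence length  =>  padded
    return np.arange(lengths.max())[None, :] >= lengths[:, None]

mask = make_pad_mask_np([5, 3, 2])
# [[False, False, False, False, False],
#  [False, False, False, True,  True ],
#  [False, False, True,  True,  True ]]
```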
@@ -127,12 +109,9 @@ def initialize(model: nn.Layer, init: str):
    Custom initialization routines can be implemented into submodules.

    Args:
        model (nn.Layer): Target.
        init (str): Method of initialization.
    """
    assert check_argument_types()
...
...
paddlespeech/t2s/modules/pqmf.py
...
...
@@ -24,17 +24,13 @@ def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0):
    """Design prototype filter for PQMF.
    This method is based on `A Kaiser window approach for the design of prototype
    filters of cosine modulated filterbanks`_.

    Args:
        taps (int): The number of filter taps.
        cutoff_ratio (float): Cut-off frequency ratio.
        beta (float): Beta coefficient for kaiser window.

    Returns:
        ndarray: Impulse response of prototype filter (taps + 1,).

    .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
        https://ieeexplore.ieee.org/abstract/document/681427
...
...
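The Kaiser-window design that `design_prototype_filter` documents is a truncated ideal lowpass shaped by a Kaiser window. The sketch below follows that standard construction with numpy; it is a hypothetical re-derivation, not the paddle source:

```python
import numpy as np

def design_prototype_filter_np(taps=62, cutoff_ratio=0.142, beta=9.0):
    """Ideal lowpass impulse response truncated to taps + 1 samples,
    shaped by a Kaiser window."""
    assert taps % 2 == 0, "taps must be even"
    n = np.arange(taps + 1) - 0.5 * taps
    omega_c = np.pi * cutoff_ratio
    # sin(omega_c * n) / (pi * n), with the n = 0 sample set to its limit
    with np.errstate(invalid="ignore"):
        h_ideal = np.sin(omega_c * n) / (np.pi * n)
    h_ideal[taps // 2] = cutoff_ratio
    return h_ideal * np.kaiser(taps + 1, beta)

h = design_prototype_filter_np()  # (63,) linear-phase prototype
```

The analysis/synthesis banks are then built by cosine-modulating this single prototype per subband.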
@@ -68,16 +64,12 @@ class PQMF(nn.Layer):
    """Initialize PQMF module.
    The cutoff_ratio and beta parameters are optimized for #subbands = 4.
    See discussion in https://github.com/kan-bayashi/ParallelWaveGAN/issues/195.

    Args:
        subbands (int): The number of subbands.
        taps (int): The number of filter taps.
        cutoff_ratio (float): Cut-off frequency ratio.
        beta (float): Beta coefficient for kaiser window.
    """
    super().__init__()
...
...
@@ -110,28 +102,20 @@ class PQMF(nn.Layer):
    def analysis(self, x):
        """Analysis with PQMF.

        Args:
            x (Tensor): Input tensor (B, 1, T).

        Returns:
            Tensor: Output tensor (B, subbands, T // subbands).
        """
        x = F.conv1d(self.pad_fn(x), self.analysis_filter)
        return F.conv1d(x, self.updown_filter, stride=self.subbands)

    def synthesis(self, x):
        """Synthesis with PQMF.

        Args:
            x (Tensor): Input tensor (B, subbands, T // subbands).

        Returns:
            Tensor: Output tensor (B, 1, T).
        """
        x = F.conv1d_transpose(
            x, self.updown_filter * self.subbands, stride=self.subbands)
...
...
paddlespeech/t2s/modules/predictor/duration_predictor.py
...
...
@@ -49,20 +49,13 @@ class DurationPredictor(nn.Layer):
                 offset=1.0):
        """Initialize duration predictor module.

        Args:
            idim (int): Input dimension.
            n_layers (int, optional): Number of convolutional layers.
            n_chans (int, optional): Number of channels of convolutional layers.
            kernel_size (int, optional): Kernel size of convolutional layers.
            dropout_rate (float, optional): Dropout rate.
            offset (float, optional): Offset value to avoid nan in log domain.
        """
        super().__init__()
...
...
@@ -105,35 +98,23 @@ class DurationPredictor(nn.Layer):
    def forward(self, xs, x_masks=None):
        """Calculate forward propagation.

        Args:
            xs (Tensor): Batch of input sequences (B, Tmax, idim).
            x_masks (ByteTensor, optional): Batch of masks indicating padded part (B, Tmax). (Default value = None)

        Returns:
            Tensor: Batch of predicted durations in log domain (B, Tmax).
        """
        return self._forward(xs, x_masks, False)

    def inference(self, xs, x_masks=None):
        """Inference duration.

        Args:
            xs (Tensor): Batch of input sequences (B, Tmax, idim).
            x_masks (Tensor(bool), optional): Batch of masks indicating padded part (B, Tmax). (Default value = None)

        Returns:
            Tensor: Batch of predicted durations in linear domain int64 (B, Tmax).
        """
        return self._forward(xs, x_masks, True)
...
...
@@ -147,13 +128,9 @@ class DurationPredictorLoss(nn.Layer):
    def __init__(self, offset=1.0, reduction="mean"):
        """Initialize duration predictor loss module.

        Args:
            offset (float, optional): Offset value to avoid nan in log domain.
            reduction (str): Reduction type in loss calculation.
        """
        super().__init__()
        self.criterion = nn.MSELoss(reduction=reduction)
...
...
@@ -162,20 +139,14 @@ class DurationPredictorLoss(nn.Layer):
    def forward(self, outputs, targets):
        """Calculate forward propagation.

        Args:
            outputs (Tensor): Batch of prediction durations in log domain (B, T).
            targets (Tensor): Batch of groundtruth durations in linear domain (B, T).

        Returns:
            Tensor: Mean squared error loss value.

        Note:
            `outputs` is in log domain but `targets` is in linear domain.
        """
        # NOTE: outputs is in log domain while targets in linear
...
...
paddlespeech/t2s/modules/predictor/length_regulator.py
...
...
@@ -35,10 +35,8 @@ class LengthRegulator(nn.Layer):
    def __init__(self, pad_value=0.0):
        """Initialize length regulator module.

        Args:
            pad_value (float, optional): Value used for padding.
        """
        super().__init__()
...
...
@@ -90,19 +88,13 @@ class LengthRegulator(nn.Layer):
    def forward(self, xs, ds, alpha=1.0, is_inference=False):
        """Calculate forward propagation.

        Args:
            xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D).
            ds (Tensor(int64)): Batch of durations of each frame (B, T).
            alpha (float, optional): Alpha value to control speed of speech.

        Returns:
            Tensor: replicated input tensor based on durations (B, T*, D).
        """
        if alpha != 1.0:
...
...
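The expansion that `LengthRegulator.forward` documents can be illustrated with plain NumPy. This sketch uses a hypothetical function name and a single unbatched sequence for brevity; it mirrors the `alpha != 1.0` branch visible in the hunk above and the duration-based repetition of each embedding:

```python
import numpy as np

def length_regulate(xs, ds, alpha=1.0):
    # xs: (T, D) phoneme embeddings; ds: (T,) integer frame counts.
    if alpha != 1.0:
        # Scaling durations before rounding changes speech speed:
        # alpha > 1.0 lengthens every phoneme, alpha < 1.0 shortens it.
        ds = np.round(ds.astype(np.float64) * alpha).astype(np.int64)
    # Repeat the i-th embedding ds[i] times along the time axis,
    # producing the frame-level sequence of shape (sum(ds), D).
    return np.repeat(xs, ds, axis=0)
```

The batched Paddle version additionally pads sequences in the batch to a common length with `pad_value`, which this single-sequence sketch omits.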
paddlespeech/t2s/modules/predictor/variance_predictor.py (diff collapsed)
paddlespeech/t2s/modules/residual_block.py (diff collapsed)
paddlespeech/t2s/modules/residual_stack.py (diff collapsed)
paddlespeech/t2s/modules/style_encoder.py (diff collapsed)
paddlespeech/t2s/modules/tacotron2/attentions.py (diff collapsed)
paddlespeech/t2s/modules/tacotron2/decoder.py (diff collapsed)
paddlespeech/t2s/modules/tacotron2/encoder.py (diff collapsed)
paddlespeech/t2s/modules/tade_res_block.py (diff collapsed)
paddlespeech/t2s/modules/transformer/attention.py (diff collapsed)
paddlespeech/t2s/modules/transformer/decoder.py (diff collapsed)
paddlespeech/t2s/modules/transformer/decoder_layer.py (diff collapsed)
paddlespeech/t2s/modules/transformer/embedding.py (diff collapsed)
paddlespeech/t2s/modules/transformer/encoder.py (diff collapsed)
paddlespeech/t2s/modules/transformer/encoder_layer.py (diff collapsed)
paddlespeech/t2s/modules/transformer/lightconv.py (diff collapsed)
paddlespeech/t2s/modules/transformer/mask.py (diff collapsed)
paddlespeech/t2s/modules/transformer/multi_layer_conv.py (diff collapsed)
paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py
...
...
@@ -20,14 +20,10 @@ from paddle import nn
 class PositionwiseFeedForward(nn.Layer):
     """Positionwise feed forward layer.
-    Parameters
-    ----------
-    idim : int
-        Input dimenstion.
-    hidden_units : int
-        The number of hidden units.
-    dropout_rate : float
-        Dropout rate.
+    Args:
+        idim (int): Input dimenstion.
+        hidden_units (int): The number of hidden units.
+        dropout_rate (float): Dropout rate.
     """

     def __init__(self,
...
...
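What this class computes fits in a few lines: a linear map from `idim` to `hidden_units`, an activation, and a linear map back to `idim`, applied to every time step independently. The NumPy sketch below illustrates that structure with ReLU and without dropout; the function and parameter names are illustrative, not the Paddle API:

```python
import numpy as np

def positionwise_ffn(x, w1, b1, w2, b2):
    # x: (T, idim). Each row (position) is transformed on its own;
    # there is no mixing across positions in this layer.
    hidden = np.maximum(x @ w1 + b1, 0.0)  # (T, hidden_units), ReLU
    return hidden @ w2 + b2                # (T, idim)
```

Because the same weights are applied at every position, the layer is equivalent to two 1x1 convolutions over the time axis, which is why it composes cleanly with attention blocks of any sequence length.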
paddlespeech/t2s/modules/transformer/repeat.py (diff collapsed)
paddlespeech/t2s/modules/transformer/subsampling.py (diff collapsed)
paddlespeech/t2s/modules/upsample.py (diff collapsed)
paddlespeech/t2s/training/experiment.py (diff collapsed)
paddlespeech/t2s/training/extensions/snapshot.py (diff collapsed)
paddlespeech/t2s/utils/error_rate.py (diff collapsed)
paddlespeech/t2s/utils/h5_utils.py (diff collapsed)