diff --git a/deepvoice3/compute_timestamp_ratio.py b/deepvoice3/compute_timestamp_ratio.py
index 7fc306ff98075d0b0245a45f7b0a5a093ac80485..d737059c1f447c905923749f19f5711d17041d29 100644
--- a/deepvoice3/compute_timestamp_ratio.py
+++ b/deepvoice3/compute_timestamp_ratio.py
@@ -14,7 +14,7 @@ from hparams import hparams, hparams_debug_string
from data.data import TextDataSource, MelSpecDataSource
from nnmnkwii.datasets import FileSourceDataset
from tqdm import trange
-from modules import frontend
+import g2p as frontend
def build_parser():
diff --git a/deepvoice3/data.py b/deepvoice3/data.py
index 197b1f37fe00170723e4596383f872b328bf9982..6c9c85aa7fbdc8386fb56866e55d787cbe5a1d6c 100644
--- a/deepvoice3/data.py
+++ b/deepvoice3/data.py
@@ -25,7 +25,7 @@ import random
# import global hyper parameters
from hparams import hparams
-from modules import frontend
+import g2p as frontend
import builder
_frontend = getattr(frontend, hparams.frontend)
diff --git a/deepvoice3/dry_run.py b/deepvoice3/dry_run.py
index 4c1366a40da9eafc8687b613212a5de0c3e319e8..4428e40ef34e655d8059f73f9a47edb0a89d5574 100644
--- a/deepvoice3/dry_run.py
+++ b/deepvoice3/dry_run.py
@@ -17,7 +17,7 @@ from paddle import fluid
import paddle.fluid.dygraph as dg
from hparams import hparams, hparams_debug_string
-from modules import frontend
+import g2p as frontend
from deepvoice3 import DeepVoiceTTS
diff --git a/deepvoice3/eval_model.py b/deepvoice3/eval_model.py
index 870fdd6196f98534e352f2bb53cceeef60092381..101d352e5fcd4c02a93502181ba75e2d14fe0d89 100644
--- a/deepvoice3/eval_model.py
+++ b/deepvoice3/eval_model.py
@@ -37,7 +37,7 @@ from tensorboardX import SummaryWriter
# import global hyper parameters
from hparams import hparams
-from modules import frontend
+import g2p as frontend
_frontend = getattr(frontend, hparams.frontend)
diff --git a/deepvoice3/synthesis.py b/deepvoice3/synthesis.py
index e043403f944963fef64202ccb42d15cfb1e8eaa5..e589c570a44b45d5aa7782ec66a9c53c432d8c4a 100644
--- a/deepvoice3/synthesis.py
+++ b/deepvoice3/synthesis.py
@@ -30,7 +30,7 @@ import paddle.fluid.dygraph as dg
sys.path.append("../")
import audio
-from modules import frontend
+import g2p as frontend
import dry_run
from hparams import hparams
diff --git a/deepvoice3/train.py b/deepvoice3/train.py
index f36ade4676e1c18ba5abe41fc37b9435c1bbd8ea..0f6859ca965b2699315ac6b4d2c69fc7c4b78d73 100644
--- a/deepvoice3/train.py
+++ b/deepvoice3/train.py
@@ -32,7 +32,7 @@ from data import (TextDataSource, MelSpecDataSource,
LinearSpecDataSource,
PartialyRandomizedSimilarTimeLengthSampler,
Dataset, make_loader, create_batch)
-from modules import frontend
+import g2p as frontend
from builder import deepvoice3, WindowRange
from dry_run import dry_run
from train_model import train_model
diff --git a/modules/__init__.py b/modules/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/modules/conv.py b/modules/conv.py
deleted file mode 100644
index 34149be142490b52e60544e075741e5392fdd261..0000000000000000000000000000000000000000
--- a/modules/conv.py
+++ /dev/null
@@ -1,222 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-import numpy as np
-
-import paddle
-from paddle import fluid
-import paddle.fluid.dygraph as dg
-
-from weight_norm import Conv2D, Conv2DTranspose
-
-
-class Conv1D(dg.Layer):
- """
- A convolution 1D block implemented with Conv2D. Form simplicity and
- ensuring the output has the same length as the input, it does not allow
- stride > 1.
- """
-
- def __init__(self,
- name_scope,
- in_cahnnels,
- num_filters,
- filter_size=3,
- dilation=1,
- groups=None,
- causal=False,
- param_attr=None,
- bias_attr=None,
- use_cudnn=True,
- act=None,
- dtype="float32"):
- super(Conv1D, self).__init__(name_scope, dtype=dtype)
-
- if causal:
- padding = dilation * (filter_size - 1)
- else:
- padding = (dilation * (filter_size - 1)) // 2
-
- self.in_channels = in_cahnnels
- self.num_filters = num_filters
- self.filter_size = filter_size
- self.dilation = dilation
- self.causal = causal
- self.padding = padding
- self.act = act
-
- self.conv = Conv2D(
- self.full_name(),
- num_filters=num_filters,
- filter_size=(1, filter_size),
- stride=(1, 1),
- dilation=(1, dilation),
- padding=(0, padding),
- groups=groups,
- param_attr=param_attr,
- bias_attr=bias_attr,
- use_cudnn=use_cudnn,
- act=act,
- dtype=dtype)
-
- def forward(self, x):
- """
- Args:
- x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
- input channels.
-
- Returns:
- x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means
- output channels (num_filters).
- """
- x = self.conv(x)
- if self.filter_size > 1:
- if self.causal:
- x = fluid.layers.slice(
- x, axes=[3], starts=[0], ends=[-self.padding])
- elif self.filter_size % 2 == 0:
- x = fluid.layers.slice(x, axes=[3], starts=[0], ends=[-1])
- return x
-
- def start_new_sequence(self):
- self.temp_weight = None
- self.input_buffer = None
-
- def add_input(self, x):
- """
- Adding input for a time step and compute an output for a time step.
-
- Args:
- x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
- input channels, and T = 1.
-
- Returns:
- out (Variable): Shape(B, C_out, 1, T), the outputs, where C_out
- means output channels (num_filters), and T = 1.
-
- """
- if self.temp_weight is None:
- self.temp_weight = self._reshaped_weight()
-
- window_size = 1 + (self.filter_size - 1) * self.dilation
- batch_size = x.shape[0]
- in_channels = x.shape[1]
-
- if self.filter_size > 1:
- if self.input_buffer is None:
- self.input_buffer = fluid.layers.fill_constant(
- [batch_size, in_channels, 1, window_size - 1],
- dtype=x.dtype,
- value=0.0)
- else:
- self.input_buffer = self.input_buffer[:, :, :, 1:]
- self.input_buffer = fluid.layers.concat(
- [self.input_buffer, x], axis=3)
- x = self.input_buffer
- if self.dilation > 1:
- if not hasattr(self, "indices"):
- self.indices = dg.to_variable(
- np.arange(0, window_size, self.dilation))
- tmp = fluid.layers.transpose(
- self.input_buffer, perm=[3, 1, 2, 0])
- tmp = fluid.layers.gather(tmp, index=self.indices)
- tmp = fluid.layers.transpose(tmp, perm=[3, 1, 2, 0])
- x = tmp
- inputs = fluid.layers.reshape(
- x, shape=[batch_size, in_channels * 1 * self.filter_size])
- out = fluid.layers.matmul(inputs, self.temp_weight, transpose_y=True)
- out = fluid.layers.elementwise_add(out, self.conv._bias_param, axis=-1)
- out = fluid.layers.reshape(out, out.shape + [1, 1])
- out = self._helper.append_activation(out, act=self.act)
- return out
-
- def _reshaped_weight(self):
- """
- Get the linearized weight of convolution filter, cause it is by nature
- a matmul weight. And because the model uses weight norm, compute the
- weight by weight_v * weight_g to make it faster.
-
- Returns:
- weight_matrix (Variable): Shape(C_out, C_in * 1 * kernel_size)
- """
- shape = self.conv._filter_param_v.shape
- matrix_shape = [shape[0], np.prod(shape[1:])]
- weight_matrix = fluid.layers.reshape(
- self.conv._filter_param_v, shape=matrix_shape)
- weight_matrix = fluid.layers.elementwise_mul(
- fluid.layers.l2_normalize(
- weight_matrix, axis=1),
- self.conv._filter_param_g,
- axis=0)
- return weight_matrix
-
-
-class Conv1DTranspose(dg.Layer):
- """
- A convolutional transpose 1D block implemented with convolutional transpose
- 2D. It does not ensure that the output is exactly expanded stride times in
- time dimension.
- """
-
- def __init__(self,
- name_scope,
- in_channels,
- num_filters,
- filter_size,
- padding=0,
- stride=1,
- dilation=1,
- groups=None,
- param_attr=None,
- bias_attr=None,
- use_cudnn=True,
- act=None,
- dtype="float32"):
- super(Conv1DTranspose, self).__init__(name_scope, dtype=dtype)
-
- self.in_channels = in_channels
- self.num_filters = num_filters
- self.filter_size = filter_size
- self.padding = padding
- self.stride = stride
- self.dilation = dilation
- self.groups = groups
-
- self.conv_transpose = Conv2DTranspose(
- self.full_name(),
- num_filters,
- filter_size=(1, filter_size),
- padding=(0, padding),
- stride=(1, stride),
- dilation=(1, dilation),
- groups=groups,
- param_attr=param_attr,
- bias_attr=bias_attr,
- use_cudnn=use_cudnn,
- act=act,
- dtype=dtype)
-
- def forward(self, x):
- """
- Argss:
- x (Variable): Shape(B, C_in, 1, T_in), where C_in means the input
- channels and T_in means the number of time steps of input.
-
- Returns:
- out (Variable): shape(B, C_out, 1, T_out), where C_out means the
- output channels and T_out means the number of time steps of
- input.
- """
- return self.conv_transpose(x)
diff --git a/modules/frontend/README.md b/modules/frontend/README.md
deleted file mode 100644
index af4513e747f223e42ebc10f05761f6f3146d8a27..0000000000000000000000000000000000000000
--- a/modules/frontend/README.md
+++ /dev/null
@@ -1 +0,0 @@
-This package is adapted from https://github.com/r9y9/deepvoice3_pytorch/tree/master/deepvoice3_pytorch/frontend, Copyright (c) 2017: Ryuichi Yamamoto, whose license applies.
diff --git a/modules/frontend/__init__.py b/modules/frontend/__init__.py
deleted file mode 100644
index f5f6c0ae9e19e140f70d454cfa4ca6ebdb324f22..0000000000000000000000000000000000000000
--- a/modules/frontend/__init__.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# coding: utf-8
-"""Text processing frontend
-
-All frontend module should have the following functions:
-
-- text_to_sequence(text, p)
-- sequence_to_text(sequence)
-
-and the property:
-
-- n_vocab
-
-"""
-from . import en
-
-# optinoal Japanese frontend
-try:
- from . import jp
-except ImportError:
- jp = None
-
-try:
- from . import ko
-except ImportError:
- ko = None
-
-# if you are going to use the frontend, you need to modify _characters in
-# symbol.py:
-# _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' + '¡¿ñáéíóúÁÉÍÓÚÑ'
-try:
- from . import es
-except ImportError:
- es = None
diff --git a/modules/frontend/en/__init__.py b/modules/frontend/en/__init__.py
deleted file mode 100644
index 58cf2f6415f010936c3b7eb2eceb027fe4e640fb..0000000000000000000000000000000000000000
--- a/modules/frontend/en/__init__.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# coding: utf-8
-from modules.frontend.text.symbols import symbols
-
-import nltk
-from random import random
-
-n_vocab = len(symbols)
-
-_arpabet = nltk.corpus.cmudict.dict()
-
-
-def _maybe_get_arpabet(word, p):
- try:
- phonemes = _arpabet[word][0]
- phonemes = " ".join(phonemes)
- except KeyError:
- return word
-
- return '{%s}' % phonemes if random() < p else word
-
-
-def mix_pronunciation(text, p):
- text = ' '.join(_maybe_get_arpabet(word, p) for word in text.split(' '))
- return text
-
-
-def text_to_sequence(text, p=0.0):
- if p >= 0:
- text = mix_pronunciation(text, p)
- from modules.frontend.text import text_to_sequence
- text = text_to_sequence(text, ["english_cleaners"])
- return text
-
-
-from modules.frontend.text import sequence_to_text
diff --git a/modules/frontend/es/__init__.py b/modules/frontend/es/__init__.py
deleted file mode 100644
index 24323e58a939ea5e4196755f23048a9b66d52fd2..0000000000000000000000000000000000000000
--- a/modules/frontend/es/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# coding: utf-8
-from deepvoice3_paddle.frontend.text.symbols import symbols
-
-import nltk
-from random import random
-
-n_vocab = len(symbols)
-
-
-def text_to_sequence(text, p=0.0):
- from deepvoice3_paddle.frontend.text import text_to_sequence
- text = text_to_sequence(text, ["basic_cleaners"])
- return text
-
-
-from deepvoice3_paddle.frontend.text import sequence_to_text
diff --git a/modules/frontend/jp/__init__.py b/modules/frontend/jp/__init__.py
deleted file mode 100644
index 36c7fd84e1b4e5b5c4f4116978d8a35123b91cb6..0000000000000000000000000000000000000000
--- a/modules/frontend/jp/__init__.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# coding: utf-8
-
-import MeCab
-import jaconv
-from random import random
-
-n_vocab = 0xffff
-
-_eos = 1
-_pad = 0
-_tagger = None
-
-
-def _yomi(mecab_result):
- tokens = []
- yomis = []
- for line in mecab_result.split("\n")[:-1]:
- s = line.split("\t")
- if len(s) == 1:
- break
- token, rest = s
- rest = rest.split(",")
- tokens.append(token)
- yomi = rest[7] if len(rest) > 7 else None
- yomi = None if yomi == "*" else yomi
- yomis.append(yomi)
-
- return tokens, yomis
-
-
-def _mix_pronunciation(tokens, yomis, p):
- return "".join(yomis[idx]
- if yomis[idx] is not None and random() < p else tokens[idx]
- for idx in range(len(tokens)))
-
-
-def mix_pronunciation(text, p):
- global _tagger
- if _tagger is None:
- _tagger = MeCab.Tagger("")
- tokens, yomis = _yomi(_tagger.parse(text))
- return _mix_pronunciation(tokens, yomis, p)
-
-
-def add_punctuation(text):
- last = text[-1]
- if last not in [".", ",", "、", "。", "!", "?", "!", "?"]:
- text = text + "。"
- return text
-
-
-def normalize_delimitor(text):
- text = text.replace(",", "、")
- text = text.replace(".", "。")
- text = text.replace(",", "、")
- text = text.replace(".", "。")
- return text
-
-
-def text_to_sequence(text, p=0.0):
- for c in [" ", " ", "「", "」", "『", "』", "・", "【", "】", "(", ")", "(", ")"]:
- text = text.replace(c, "")
- text = text.replace("!", "!")
- text = text.replace("?", "?")
-
- text = normalize_delimitor(text)
- text = jaconv.normalize(text)
- if p > 0:
- text = mix_pronunciation(text, p)
- text = jaconv.hira2kata(text)
- text = add_punctuation(text)
-
- return [ord(c) for c in text] + [_eos] # EOS
-
-
-def sequence_to_text(seq):
- return "".join(chr(n) for n in seq)
diff --git a/modules/frontend/ko/__init__.py b/modules/frontend/ko/__init__.py
deleted file mode 100644
index ccb8b5f1cb78f4561f4e557e78712d2c1c82f016..0000000000000000000000000000000000000000
--- a/modules/frontend/ko/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# coding: utf-8
-
-from random import random
-
-n_vocab = 0xffff
-
-_eos = 1
-_pad = 0
-_tagger = None
-
-
-def text_to_sequence(text, p=0.0):
- return [ord(c) for c in text] + [_eos] # EOS
-
-
-def sequence_to_text(seq):
- return "".join(chr(n) for n in seq)
diff --git a/modules/frontend/text/__init__.py b/modules/frontend/text/__init__.py
deleted file mode 100644
index 26244ce3d44929613f5ad52e8e3a3e0dd85f07f1..0000000000000000000000000000000000000000
--- a/modules/frontend/text/__init__.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import re
-from . import cleaners
-from .symbols import symbols
-
-# Mappings from symbol to numeric ID and vice versa:
-_symbol_to_id = {s: i for i, s in enumerate(symbols)}
-_id_to_symbol = {i: s for i, s in enumerate(symbols)}
-
-# Regular expression matching text enclosed in curly braces:
-_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
-
-
-def text_to_sequence(text, cleaner_names):
- '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
-
- The text can optionally have ARPAbet sequences enclosed in curly braces embedded
- in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
-
- Args:
- text: string to convert to a sequence
- cleaner_names: names of the cleaner functions to run the text through
-
- Returns:
- List of integers corresponding to the symbols in the text
- '''
- sequence = []
-
- # Check for curly braces and treat their contents as ARPAbet:
- while len(text):
- m = _curly_re.match(text)
- if not m:
- sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
- break
- sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
- sequence += _arpabet_to_sequence(m.group(2))
- text = m.group(3)
-
- # Append EOS token
- sequence.append(_symbol_to_id['~'])
- return sequence
-
-
-def sequence_to_text(sequence):
- '''Converts a sequence of IDs back to a string'''
- result = ''
- for symbol_id in sequence:
- if symbol_id in _id_to_symbol:
- s = _id_to_symbol[symbol_id]
- # Enclose ARPAbet back in curly braces:
- if len(s) > 1 and s[0] == '@':
- s = '{%s}' % s[1:]
- result += s
- return result.replace('}{', ' ')
-
-
-def _clean_text(text, cleaner_names):
- for name in cleaner_names:
- cleaner = getattr(cleaners, name)
- if not cleaner:
- raise Exception('Unknown cleaner: %s' % name)
- text = cleaner(text)
- return text
-
-
-def _symbols_to_sequence(symbols):
- return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
-
-
-def _arpabet_to_sequence(text):
- return _symbols_to_sequence(['@' + s for s in text.split()])
-
-
-def _should_keep_symbol(s):
- return s in _symbol_to_id and s is not '_' and s is not '~'
diff --git a/modules/frontend/text/cleaners.py b/modules/frontend/text/cleaners.py
deleted file mode 100644
index e94226476968df18c5e28f4b705f9161f3555025..0000000000000000000000000000000000000000
--- a/modules/frontend/text/cleaners.py
+++ /dev/null
@@ -1,104 +0,0 @@
-'''
-Cleaners are transformations that run over the input text at both training and
-eval time.
-
-Cleaners can be selected by passing a comma-delimited list of cleaner names as
-the "cleaners" hyperparameter. Some cleaners are English-specific. You'll
-typically want to use:
- 1. "english_cleaners" for English text
- 2. "transliteration_cleaners" for non-English text that can be transliterated
- to ASCII using the Unidecode library (https://pypi.python.org/pypi/Unidecode)
- 3. "basic_cleaners" if you do not want to transliterate (in this case, you
- should also update the symbols in symbols.py to match your data).
-'''
-
-import re
-from unidecode import unidecode
-from .numbers import normalize_numbers
-
-# Regular expression matching whitespace:
-_whitespace_re = re.compile(r'\s+')
-
-# List of (regular expression, replacement) pairs for abbreviations:
-_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
- for x in [
- ('mrs', 'misess'),
- ('mr', 'mister'),
- ('dr', 'doctor'),
- ('st', 'saint'),
- ('co', 'company'),
- ('jr', 'junior'),
- ('maj', 'major'),
- ('gen', 'general'),
- ('drs', 'doctors'),
- ('rev', 'reverend'),
- ('lt', 'lieutenant'),
- ('hon', 'honorable'),
- ('sgt', 'sergeant'),
- ('capt', 'captain'),
- ('esq', 'esquire'),
- ('ltd', 'limited'),
- ('col', 'colonel'),
- ('ft', 'fort'),
- ]]
-
-
-def expand_abbreviations(text):
- for regex, replacement in _abbreviations:
- text = re.sub(regex, replacement, text)
- return text
-
-
-def expand_numbers(text):
- return normalize_numbers(text)
-
-
-def lowercase(text):
- return text.lower()
-
-
-def collapse_whitespace(text):
- return re.sub(_whitespace_re, ' ', text)
-
-
-def convert_to_ascii(text):
- return unidecode(text)
-
-
-def add_punctuation(text):
- if len(text) == 0:
- return text
- if text[-1] not in '!,.:;?':
- text = text + '.' # without this decoder is confused when to output EOS
- return text
-
-
-def basic_cleaners(text):
- '''
- Basic pipeline that lowercases and collapses whitespace without
- transliteration.
- '''
- text = lowercase(text)
- text = collapse_whitespace(text)
- return text
-
-
-def transliteration_cleaners(text):
- '''Pipeline for non-English text that transliterates to ASCII.'''
- text = convert_to_ascii(text)
- text = lowercase(text)
- text = collapse_whitespace(text)
- return text
-
-
-def english_cleaners(text):
- '''
- Pipeline for English text, including number and abbreviation expansion.
- '''
- text = convert_to_ascii(text)
- text = add_punctuation(text)
- text = lowercase(text)
- text = expand_numbers(text)
- text = expand_abbreviations(text)
- text = collapse_whitespace(text)
- return text
diff --git a/modules/frontend/text/cmudict.py b/modules/frontend/text/cmudict.py
deleted file mode 100644
index 304592b66e246d6c578ad024e63548a00aa6a9b5..0000000000000000000000000000000000000000
--- a/modules/frontend/text/cmudict.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import re
-
-valid_symbols = [
- 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
- 'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0',
- 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0',
- 'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1',
- 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW',
- 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T',
- 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y',
- 'Z', 'ZH'
-]
-
-_valid_symbol_set = set(valid_symbols)
-
-
-class CMUDict:
- '''
- Thin wrapper around CMUDict data.
- http://www.speech.cs.cmu.edu/cgi-bin/cmudict
- '''
-
- def __init__(self, file_or_path, keep_ambiguous=True):
- if isinstance(file_or_path, str):
- with open(file_or_path, encoding='latin-1') as f:
- entries = _parse_cmudict(f)
- else:
- entries = _parse_cmudict(file_or_path)
- if not keep_ambiguous:
- entries = {
- word: pron
- for word, pron in entries.items() if len(pron) == 1
- }
- self._entries = entries
-
- def __len__(self):
- return len(self._entries)
-
- def lookup(self, word):
- '''Returns list of ARPAbet pronunciations of the given word.'''
- return self._entries.get(word.upper())
-
-
-_alt_re = re.compile(r'\([0-9]+\)')
-
-
-def _parse_cmudict(file):
- cmudict = {}
- for line in file:
- if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
- parts = line.split(' ')
- word = re.sub(_alt_re, '', parts[0])
- pronunciation = _get_pronunciation(parts[1])
- if pronunciation:
- if word in cmudict:
- cmudict[word].append(pronunciation)
- else:
- cmudict[word] = [pronunciation]
- return cmudict
-
-
-def _get_pronunciation(s):
- parts = s.strip().split(' ')
- for part in parts:
- if part not in _valid_symbol_set:
- return None
- return ' '.join(parts)
diff --git a/modules/frontend/text/numbers.py b/modules/frontend/text/numbers.py
deleted file mode 100644
index 24b58175dc1028994a0bcccfde6531b21a27cb72..0000000000000000000000000000000000000000
--- a/modules/frontend/text/numbers.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# -*- coding: utf-8 -*-
-
-import inflect
-import re
-
-_inflect = inflect.engine()
-_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
-_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
-_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
-_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
-_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
-_number_re = re.compile(r'[0-9]+')
-
-
-def _remove_commas(m):
- return m.group(1).replace(',', '')
-
-
-def _expand_decimal_point(m):
- return m.group(1).replace('.', ' point ')
-
-
-def _expand_dollars(m):
- match = m.group(1)
- parts = match.split('.')
- if len(parts) > 2:
- return match + ' dollars' # Unexpected format
- dollars = int(parts[0]) if parts[0] else 0
- cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
- if dollars and cents:
- dollar_unit = 'dollar' if dollars == 1 else 'dollars'
- cent_unit = 'cent' if cents == 1 else 'cents'
- return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
- elif dollars:
- dollar_unit = 'dollar' if dollars == 1 else 'dollars'
- return '%s %s' % (dollars, dollar_unit)
- elif cents:
- cent_unit = 'cent' if cents == 1 else 'cents'
- return '%s %s' % (cents, cent_unit)
- else:
- return 'zero dollars'
-
-
-def _expand_ordinal(m):
- return _inflect.number_to_words(m.group(0))
-
-
-def _expand_number(m):
- num = int(m.group(0))
- if num > 1000 and num < 3000:
- if num == 2000:
- return 'two thousand'
- elif num > 2000 and num < 2010:
- return 'two thousand ' + _inflect.number_to_words(num % 100)
- elif num % 100 == 0:
- return _inflect.number_to_words(num // 100) + ' hundred'
- else:
- return _inflect.number_to_words(
- num, andword='', zero='oh', group=2).replace(', ', ' ')
- else:
- return _inflect.number_to_words(num, andword='')
-
-
-def normalize_numbers(text):
- text = re.sub(_comma_number_re, _remove_commas, text)
- text = re.sub(_pounds_re, r'\1 pounds', text)
- text = re.sub(_dollars_re, _expand_dollars, text)
- text = re.sub(_decimal_number_re, _expand_decimal_point, text)
- text = re.sub(_ordinal_re, _expand_ordinal, text)
- text = re.sub(_number_re, _expand_number, text)
- return text
diff --git a/modules/frontend/text/symbols.py b/modules/frontend/text/symbols.py
deleted file mode 100644
index c6fc28bcf2f894147492559fce79d5723e6e110b..0000000000000000000000000000000000000000
--- a/modules/frontend/text/symbols.py
+++ /dev/null
@@ -1,18 +0,0 @@
-'''
-Defines the set of symbols used in text input to the model.
-
-The default is a set of ASCII characters that works well for English or text
-that has been run through Unidecode. For other data, you can modify _characters.
-See TRAINING_DATA.md for details.
-'''
-from .cmudict import valid_symbols
-
-_pad = '_'
-_eos = '~'
-_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
-
-# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
-_arpabet = ['@' + s for s in valid_symbols]
-
-# Export all symbols:
-symbols = [_pad, _eos] + list(_characters) + _arpabet
diff --git a/modules/loss.py b/modules/loss.py
deleted file mode 100644
index 96bcd3ba96f84ff0c93b06f4290b50243cadc2f2..0000000000000000000000000000000000000000
--- a/modules/loss.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-from numba import jit
-
-from paddle import fluid
-import paddle.fluid.dygraph as dg
-
-
-def masked_mean(inputs, mask):
- """
- Args:
- inputs (Variable): Shape(B, C, 1, T), the input, where B means
- batch size, C means channels of input, T means timesteps of
- the input.
- mask (Variable): Shape(B, T), a mask.
- Returns:
- loss (Variable): Shape(1, ), masked mean.
- """
- channels = inputs.shape[1]
- reshaped_mask = fluid.layers.reshape(
- mask, shape=[mask.shape[0], 1, 1, mask.shape[-1]])
- expanded_mask = fluid.layers.expand(
- reshaped_mask, expand_times=[1, channels, 1, 1])
- expanded_mask.stop_gradient = True
-
- valid_cnt = fluid.layers.reduce_sum(expanded_mask)
- valid_cnt.stop_gradient = True
-
- masked_inputs = inputs * expanded_mask
- loss = fluid.layers.reduce_sum(masked_inputs) / valid_cnt
- return loss
-
-
-@jit(nopython=True)
-def guided_attention(N, max_N, T, max_T, g):
- W = np.zeros((max_N, max_T), dtype=np.float32)
- for n in range(N):
- for t in range(T):
- W[n, t] = 1 - np.exp(-(n / N - t / T)**2 / (2 * g * g))
- return W
-
-
-def guided_attentions(input_lengths, target_lengths, max_target_len, g=0.2):
- B = len(input_lengths)
- max_input_len = input_lengths.max()
- W = np.zeros((B, max_target_len, max_input_len), dtype=np.float32)
- for b in range(B):
- W[b] = guided_attention(input_lengths[b], max_input_len,
- target_lengths[b], max_target_len, g).T
- return W
-
-
-class TTSLoss(object):
- def __init__(self,
- masked_weight=0.0,
- priority_weight=0.0,
- binary_divergence_weight=0.0,
- guided_attention_sigma=0.2):
- self.masked_weight = masked_weight
- self.priority_weight = priority_weight
- self.binary_divergence_weight = binary_divergence_weight
- self.guided_attention_sigma = guided_attention_sigma
-
- def l1_loss(self, prediction, target, mask, priority_bin=None):
- abs_diff = fluid.layers.abs(prediction - target)
-
- # basic mask-weighted l1 loss
- w = self.masked_weight
- if w > 0 and mask is not None:
- base_l1_loss = w * masked_mean(abs_diff, mask) + (
- 1 - w) * fluid.layers.reduce_mean(abs_diff)
- else:
- base_l1_loss = fluid.layers.reduce_mean(abs_diff)
-
- if self.priority_weight > 0 and priority_bin is not None:
- # mask-weighted priority channels' l1-loss
- priority_abs_diff = fluid.layers.slice(
- abs_diff, axes=[1], starts=[0], ends=[priority_bin])
- if w > 0 and mask is not None:
- priority_loss = w * masked_mean(priority_abs_diff, mask) + (
- 1 - w) * fluid.layers.reduce_mean(priority_abs_diff)
- else:
- priority_loss = fluid.layers.reduce_mean(priority_abs_diff)
-
- # priority weighted sum
- p = self.priority_weight
- loss = p * priority_loss + (1 - p) * base_l1_loss
- else:
- loss = base_l1_loss
- return loss
-
- def binary_divergence(self, prediction, target, mask):
- flattened_prediction = fluid.layers.reshape(prediction, [-1, 1])
- flattened_target = fluid.layers.reshape(target, [-1, 1])
- flattened_loss = fluid.layers.log_loss(
- flattened_prediction, flattened_target, epsilon=1e-8)
- bin_div = fluid.layers.reshape(flattened_loss, prediction.shape)
-
- w = self.masked_weight
- if w > 0 and mask is not None:
- loss = w * masked_mean(bin_div, mask) + (
- 1 - w) * fluid.layers.reduce_mean(bin_div)
- else:
- loss = fluid.layers.reduce_mean(bin_div)
- return loss
-
- @staticmethod
- def done_loss(done_hat, done):
- flat_done_hat = fluid.layers.reshape(done_hat, [-1, 1])
- flat_done = fluid.layers.reshape(done, [-1, 1])
- loss = fluid.layers.log_loss(flat_done_hat, flat_done, epsilon=1e-8)
- loss = fluid.layers.reduce_mean(loss)
- return loss
-
- def attention_loss(self, predicted_attention, input_lengths,
- target_lengths):
- """
- Given valid encoder_lengths and decoder_lengths, compute a diagonal
- guide, and compute loss from the predicted attention and the guide.
-
- Args:
- predicted_attention (Variable): Shape(*, B, T_dec, T_enc), the
- alignment tensor, where B means batch size, T_dec means number
- of time steps of the decoder, T_enc means the number of time
- steps of the encoder, * means other possible dimensions.
- input_lengths (numpy.ndarray): Shape(B,), dtype:int64, valid lengths
- (time steps) of encoder outputs.
- target_lengths (numpy.ndarray): Shape(batch_size,), dtype:int64,
- valid lengths (time steps) of decoder outputs.
-
- Returns:
- loss (Variable): Shape(1, ) attention loss.
- """
- n_attention, batch_size, max_target_len, max_input_len = (
- predicted_attention.shape)
- soft_mask = guided_attentions(input_lengths, target_lengths,
- max_target_len,
- self.guided_attention_sigma)
- soft_mask_ = dg.to_variable(soft_mask)
- loss = fluid.layers.reduce_mean(predicted_attention * soft_mask_)
- return loss
diff --git a/modules/modules.py b/modules/modules.py
deleted file mode 100644
index 3ae95d78a09e58f158004862c3e44a16576b7c22..0000000000000000000000000000000000000000
--- a/modules/modules.py
+++ /dev/null
@@ -1,458 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle
-from paddle import fluid
-import paddle.fluid.dygraph as dg
-
-import numpy as np
-
-import conv
-import weight_norm as weight_norm
-
-
-def FC(name_scope,
- in_features,
- size,
- num_flatten_dims=1,
- dropout=0.0,
- epsilon=1e-30,
- act=None,
- is_test=False,
- dtype="float32"):
- """
- A special Linear Layer, when it is used with dropout, the weight is
- initialized as normal(0, std=np.sqrt((1-dropout) / in_features))
- """
-
- # stds
- if isinstance(in_features, int):
- in_features = [in_features]
- stds = [np.sqrt((1 - dropout) / in_feature) for in_feature in in_features]
- weight_inits = [
- fluid.initializer.NormalInitializer(scale=std) for std in stds
- ]
- bias_init = fluid.initializer.ConstantInitializer(0.0)
-
- # param attrs
- weight_attrs = [fluid.ParamAttr(initializer=init) for init in weight_inits]
- bias_attr = fluid.ParamAttr(initializer=bias_init)
-
- layer = weight_norm.FC(name_scope,
- size,
- num_flatten_dims=num_flatten_dims,
- param_attr=weight_attrs,
- bias_attr=bias_attr,
- act=act,
- dtype=dtype)
- return layer
-
-
-def Conv1D(name_scope,
- in_channels,
- num_filters,
- filter_size=3,
- dilation=1,
- groups=None,
- causal=False,
- std_mul=1.0,
- dropout=0.0,
- use_cudnn=True,
- act=None,
- dtype="float32"):
- """
- A special Conv1D Layer, when it is used with dropout, the weight is
- initialized as
- normal(0, std=np.sqrt(std_mul * (1-dropout) / (filter_size * in_features)))
- """
- # std
- std = np.sqrt((std_mul * (1 - dropout)) / (filter_size * in_channels))
- weight_init = fluid.initializer.NormalInitializer(loc=0.0, scale=std)
- bias_init = fluid.initializer.ConstantInitializer(0.0)
-
- # param attrs
- weight_attr = fluid.ParamAttr(initializer=weight_init)
- bias_attr = fluid.ParamAttr(initializer=bias_init)
-
- layer = conv.Conv1D(
- name_scope,
- in_channels,
- num_filters,
- filter_size,
- dilation,
- groups=groups,
- causal=causal,
- param_attr=weight_attr,
- bias_attr=bias_attr,
- use_cudnn=use_cudnn,
- act=act,
- dtype=dtype)
- return layer
-
-
-def Embedding(name_scope,
- num_embeddings,
- embed_dim,
- is_sparse=False,
- is_distributed=False,
- padding_idx=None,
- std=0.01,
- dtype="float32"):
- # param attrs
- weight_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(
- scale=std))
- layer = dg.Embedding(
- name_scope, (num_embeddings, embed_dim),
- padding_idx=padding_idx,
- param_attr=weight_attr,
- dtype=dtype)
- return layer
-
-
-class Conv1DGLU(dg.Layer):
- """
- A Convolution 1D block with GLU activation. It also applys dropout for the
- input x. It fuses speaker embeddings through a FC activated by softsign. It
- has residual connection from the input x, and scale the output by
- np.sqrt(0.5).
- """
-
- def __init__(self,
- name_scope,
- n_speakers,
- speaker_dim,
- in_channels,
- num_filters,
- filter_size,
- dilation,
- std_mul=4.0,
- dropout=0.0,
- causal=False,
- residual=True,
- dtype="float32"):
- super(Conv1DGLU, self).__init__(name_scope, dtype=dtype)
-
- # conv spec
- self.in_channels = in_channels
- self.n_speakers = n_speakers
- self.speaker_dim = speaker_dim
- self.num_filters = num_filters
- self.filter_size = filter_size
- self.dilation = dilation
- self.causal = causal
- self.residual = residual
-
- # weight init and dropout
- self.std_mul = std_mul
- self.dropout = dropout
-
- if residual:
- assert (
- in_channels == num_filters
- ), "this block uses residual connection"\
- "the input_channes should equals num_filters"
-
- self.conv = Conv1D(
- self.full_name(),
- in_channels,
- 2 * num_filters,
- filter_size,
- dilation,
- causal=causal,
- std_mul=std_mul,
- dropout=dropout,
- dtype=dtype)
-
- if n_speakers > 1:
- assert (speaker_dim is not None
- ), "speaker embed should not be null in multi-speaker case"
- self.fc = Conv1D(
- self.full_name(),
- speaker_dim,
- num_filters,
- filter_size=1,
- dilation=1,
- causal=False,
- act="softsign",
- dtype=dtype)
-
- def forward(self, x, speaker_embed_bc1t=None):
- """
- Args:
- x (Variable): Shape(B, C_in, 1, T), the input of Conv1DGLU
- layer, where B means batch_size, C_in means the input channels
- T means input time steps.
- speaker_embed_bct1 (Variable): Shape(B, C_sp, 1, T), expanded
- speaker embed, where C_sp means speaker embedding size. Note
- that when using residual connection, the Conv1DGLU does not
- change the number of channels, so out channels equals input
- channels.
-
- Returns:
- x (Variable): Shape(B, C_out, 1, T), the output of Conv1DGLU, where
- C_out means the output channels of Conv1DGLU.
- """
-
- residual = x
- x = fluid.layers.dropout(
- x, self.dropout, dropout_implementation="upscale_in_train")
- x = self.conv(x)
-
- content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)
-
- if speaker_embed_bc1t is not None:
- sp = self.fc(speaker_embed_bc1t)
- content = content + sp
-
- # glu
- x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content)
-
- if self.residual:
- x = fluid.layers.scale(x + residual, np.sqrt(0.5))
- return x
-
- def add_input(self, x, speaker_embed_bc11=None):
- """
- Inputs:
- x: shape(B, num_filters, 1, time_steps)
- speaker_embed_bc11: shape(B, speaker_dim, 1, time_steps)
-
- Outputs:
- out: shape(B, num_filters, 1, time_steps), where time_steps = 1
- """
-
- residual = x
-
- # add step input and produce step output
- x = fluid.layers.dropout(
- x, self.dropout, dropout_implementation="upscale_in_train")
- x = self.conv.add_input(x)
-
- content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)
-
- if speaker_embed_bc11 is not None:
- sp = self.fc(speaker_embed_bc11)
- content = content + sp
-
- x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content)
-
- if self.residual:
- x = fluid.layers.scale(x + residual, np.sqrt(0.5))
- return x
-
-
-def Conv1DTranspose(name_scope,
- in_channels,
- num_filters,
- filter_size,
- padding=0,
- stride=1,
- dilation=1,
- groups=None,
- std_mul=1.0,
- dropout=0.0,
- use_cudnn=True,
- act=None,
- dtype="float32"):
- std = np.sqrt(std_mul * (1 - dropout) / (in_channels * filter_size))
- weight_init = fluid.initializer.NormalInitializer(scale=std)
- weight_attr = fluid.ParamAttr(initializer=weight_init)
- bias_init = fluid.initializer.ConstantInitializer(0.0)
- bias_attr = fluid.ParamAttr(initializer=bias_init)
- layer = conv.Conv1DTranspose(
- name_scope,
- in_channels,
- num_filters,
- filter_size,
- padding=padding,
- stride=stride,
- dilation=dilation,
- groups=groups,
- param_attr=weight_attr,
- bias_attr=bias_attr,
- use_cudnn=use_cudnn,
- act=act,
- dtype=dtype)
- return layer
-
-
-def compute_position_embedding(rad):
- # rad is a transposed radius, shape(embed_dim, n_vocab)
- embed_dim, n_vocab = rad.shape
-
- even_dims = dg.to_variable(np.arange(0, embed_dim, 2).astype("int32"))
- odd_dims = dg.to_variable(np.arange(1, embed_dim, 2).astype("int32"))
-
- even_rads = fluid.layers.gather(rad, even_dims)
- odd_rads = fluid.layers.gather(rad, odd_dims)
-
- sines = fluid.layers.sin(even_rads)
- cosines = fluid.layers.cos(odd_rads)
-
- temp = fluid.layers.scatter(rad, even_dims, sines)
- out = fluid.layers.scatter(temp, odd_dims, cosines)
- out = fluid.layers.transpose(out, perm=[1, 0])
- return out
-
-
-def position_encoding_init(n_position,
- d_pos_vec,
- position_rate=1.0,
- sinusoidal=True):
- """ Init the sinusoid position encoding table """
-
- # keep idx 0 for padding token position encoding zero vector
- position_enc = np.array([[
- position_rate * pos / np.power(10000, 2 * (i // 2) / d_pos_vec)
- for i in range(d_pos_vec)
- ] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
-
- if sinusoidal:
- position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i
- position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1
-
- return position_enc
-
-
-class PositionEmbedding(dg.Layer):
- def __init__(self,
- name_scope,
- n_position,
- d_pos_vec,
- position_rate=1.0,
- is_sparse=False,
- is_distributed=False,
- param_attr=None,
- max_norm=None,
- padding_idx=None,
- dtype="float32"):
- super(PositionEmbedding, self).__init__(name_scope, dtype=dtype)
- self.embed = dg.Embedding(
- self.full_name(),
- size=(n_position, d_pos_vec),
- is_sparse=is_sparse,
- is_distributed=is_distributed,
- padding_idx=None,
- param_attr=param_attr,
- dtype=dtype)
- self.set_weight(
- position_encoding_init(
- n_position,
- d_pos_vec,
- position_rate=position_rate,
- sinusoidal=False).astype(dtype))
-
- self._is_sparse = is_sparse
- self._is_distributed = is_distributed
- self._remote_prefetch = self._is_sparse and (not self._is_distributed)
- if self._remote_prefetch:
- assert self._is_sparse is True and self._is_distributed is False
-
- self._padding_idx = (-1 if padding_idx is None else padding_idx if
- padding_idx >= 0 else (n_position + padding_idx))
- self._position_rate = position_rate
- self._max_norm = max_norm
- self._dtype = dtype
-
- def set_weight(self, array):
- assert self.embed._w.shape == list(array.shape), "shape does not match"
- self.embed._w._ivar.value().get_tensor().set(
- array, fluid.framework._current_expected_place())
-
- def forward(self, indices, speaker_position_rate=None):
- """
- Args:
- indices (Variable): Shape (B, T, 1), dtype: int64, position
- indices, where B means the batch size, T means the time steps.
- speaker_position_rate (Variable | float, optional), position
- rate. It can be a float point number or a Variable with
- shape (1,), then this speaker_position_rate is used for every
- example. It can also be a Variable with shape (B, 1), which
- contains a speaker position rate for each speaker.
- Returns:
- out (Variable): Shape(B, C_pos), position embedding, where C_pos
- means position embedding size.
- """
- rad = fluid.layers.transpose(self.embed._w, perm=[1, 0])
- batch_size = indices.shape[0]
-
- if speaker_position_rate is None:
- weight = compute_position_embedding(rad)
- out = self._helper.create_variable_for_type_inference(self._dtype)
- self._helper.append_op(
- type="lookup_table",
- inputs={"Ids": indices,
- "W": weight},
- outputs={"Out": out},
- attrs={
- "is_sparse": self._is_sparse,
- "is_distributed": self._is_distributed,
- "remote_prefetch": self._remote_prefetch,
- "padding_idx":
- self._padding_idx, # special value for lookup table op
- })
- return out
-
- elif (np.isscalar(speaker_position_rate) or
- isinstance(speaker_position_rate, fluid.framework.Variable) and
- speaker_position_rate.shape == [1, 1]):
- # # make a weight
- # scale the weight (the operand for sin & cos)
- if np.isscalar(speaker_position_rate):
- scaled_rad = fluid.layers.scale(rad, speaker_position_rate)
- else:
- scaled_rad = fluid.layers.elementwise_mul(
- rad, speaker_position_rate[0])
- weight = compute_position_embedding(scaled_rad)
- out = self._helper.create_variable_for_type_inference(self._dtype)
- self._helper.append_op(
- type="lookup_table",
- inputs={"Ids": indices,
- "W": weight},
- outputs={"Out": out},
- attrs={
- "is_sparse": self._is_sparse,
- "is_distributed": self._is_distributed,
- "remote_prefetch": self._remote_prefetch,
- "padding_idx":
- self._padding_idx, # special value for lookup table op
- })
- return out
-
- elif np.prod(speaker_position_rate.shape) > 1:
- assert speaker_position_rate.shape == [batch_size, 1]
- outputs = []
- for i in range(batch_size):
- rate = speaker_position_rate[i] # rate has shape [1]
- scaled_rad = fluid.layers.elementwise_mul(rad, rate)
- weight = compute_position_embedding(scaled_rad)
- out = self._helper.create_variable_for_type_inference(
- self._dtype)
- sequence = indices[i]
- self._helper.append_op(
- type="lookup_table",
- inputs={"Ids": sequence,
- "W": weight},
- outputs={"Out": out},
- attrs={
- "is_sparse": self._is_sparse,
- "is_distributed": self._is_distributed,
- "remote_prefetch": self._remote_prefetch,
- "padding_idx": -1,
- })
- outputs.append(out)
- out = fluid.layers.stack(outputs)
- return out
- else:
- raise Exception("Then you can just use position rate at init")
diff --git a/modules/weight_norm.py b/modules/weight_norm.py
deleted file mode 100644
index cbb0d03f9c50d697421d760d3c57ac95cfd63048..0000000000000000000000000000000000000000
--- a/modules/weight_norm.py
+++ /dev/null
@@ -1,863 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-import numpy as np
-from six.moves import reduce
-
-from copy import deepcopy
-
-import paddle
-from paddle import fluid
-import paddle.fluid.dygraph as dg
-from paddle.fluid import core
-from paddle.fluid.layers import utils
-from paddle.fluid.framework import Variable
-from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer
-
-
-def _norm(p, dim):
- """Computes the norm over all dimensions except dim.
- It differs from pytorch implementation that it does not keep dim.
- This difference is related with the broadcast mechanism in paddle.
- Read elementeise_mul for more.
- """
-
- if dim is None:
- return np.linalg.norm(p, ord=2, axis=None)
- elif dim == 0:
- p = np.reshape(p, newshape=(p.shape[0], -1))
- return np.linalg.norm(p, ord=2, axis=1)
- elif dim == p.ndim - 1:
- p = np.reshape(p, newshape=(-1, p.shape[-1]))
- return np.linalg.norm(p, ord=2, axis=0)
- else:
- perm = list(range(p.ndim))
- perm[0] = dim
- perm[dim] = 0
- return _norm(np.transpose(p, axes=perm))
-
-
-class FC(dg.Layer):
- """
- **Fully Connected Layer**
-
- This function creates a fully connected layer in the network. It can take
- one or multiple tensors as its inputs(input can be a list of Variable, see
- Args in detail). It creates a pair of variables called (magnitude(g),
- direction(V)) for each input tensor. Elementwise_mul(V, g) represents a fully connected
- weight matrix from each input unit to each output unit.
- The fully connected layer multiplies each input tensor
- with its corresponding weight to produce an output Tensor with shape [M, `size`],
- where M is batch size. If multiple input tensors are given, the results of
- multiple output tensors with shape [M, `size`] will be summed up. If bias_attr
- is not None, a bias variable will be created and added to the output.
- Finally, if activation is not None, it will be applied to the output as well.
-
- When the input is single tensor:
-
- .. math::
-
- Out = Act({X(normalize(V)g) + b})
-
- When the input are multiple tensors:
-
- .. math::
-
- Out = Act({\sum_{i=0}^{N-1}X_i(V_ig_i) + b})
-
- In the above equation:
-
- * :math:`N`: Number of the input. N equals to len(input) if input is list of Variable.
- * :math:`X_i`: The i-th input tensor.
- * :math:`V_i`: The i-th direction matrix corresponding i-th input tensor.
- * :math:`g_i`: The i-th magnitude vector corresponding i-th input tensor.
- * :math:`b`: The bias parameter created by this layer (if needed).
- * :math:`Act`: The activation function.
- * :math:`Out`: The output tensor.
-
- See below for an example.
-
- .. code-block:: text
-
- Given:
- data_1.data = [[[0.1, 0.2],
- [0.3, 0.4]]]
- data_1.shape = (1, 2, 2) # 1 is batch_size
-
- data_2 = [[[0.1, 0.2, 0.3]]]
- data_2.shape = (1, 1, 3)
-
- out = fluid.layers.fc(input=[data_1, data_2], size=2)
-
- Then:
- out.data = [[0.18669507, 0.1893476]]
- out.shape = (1, 2)
-
- Args:
- name_scope(str): The name of this class.
- size(int): The number of output units in this layer.
- num_flatten_dims (int): The fc layer can accept an input tensor with more than
- two dimensions. If this happens, the multidimensional tensor will first be flattened
- into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
- tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
- dimensions will be flatten to form the first dimension of the final matrix (height of
- the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
- form the second dimension of the final matrix (width of the matrix). For example, suppose
- `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
- Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. Default: 1
- param_attr (ParamAttr|list of ParamAttr|None): The parameter attribute for learnable
- parameters/weights of this layer.
- bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
- of this layer. If it is set to False, no bias will be added to the output units.
- If it is set to None, the bias is initialized zero. Default: None.
- act (str|None): Activation to be applied to the output of this layer.
- is_test(bool): A flag indicating whether execution is in test phase. Default: False
- dtype(str): Dtype used for weight
-
- Raises:
- ValueError: If rank of the input tensor is less than 2.
-
- Examples:
- .. code-block:: python
-
- from paddle.fluid.dygraph.base import to_variable
- import paddle.fluid as fluid
- from paddle.fluid.dygraph import FC
- import numpy as np
-
- data = np.random.uniform( -1, 1, [30, 10, 32] ).astype('float32')
- with fluid.dygraph.guard():
- fc = FC( "fc", 64, num_flatten_dims=2)
- data = to_variable( data )
- conv = fc( data )
-
- """
-
- def __init__(self,
- name_scope,
- size,
- num_flatten_dims=1,
- epsilon=1e-30,
- param_attr=None,
- bias_attr=None,
- act=None,
- is_test=False,
- dtype="float32"):
- super(FC, self).__init__(name_scope, dtype)
-
- self._size = size
- self._num_flatten_dims = num_flatten_dims
- self._epsilon = epsilon
- self._dtype = dtype
- self._param_attr = param_attr
- self._bias_attr = bias_attr
- self._act = act
- self.__g = list()
- self.__v = list()
-
- @property
- def _v(self, i=0):
- return self.__v[i]
-
- @property
- def _g(self, i=0):
- return self.__g[i]
-
- @_v.setter
- def _v(self, value, i=0):
- assert isinstance(value, Parameter)
- self.__v[i] = value
-
- @_g.setter
- def _g(self, value, i=0):
- assert isinstance(value, Parameter)
- self.__g[i] = value
-
- def _build_once(self, input):
- i = 0
- for inp, param in self._helper.iter_inputs_and_params(input,
- self._param_attr):
- input_shape = inp.shape
-
- param_shape = [
- reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:],
- 1)
- ] + [self._size]
- self.__v.append(
- self.add_parameter(
- "_v%d" % i,
- self.create_parameter(
- attr=param,
- shape=param_shape,
- dtype=self._dtype,
- is_bias=False)))
-
- magnitude_shape = param_shape[1:]
- magnitude_value = np.linalg.norm(self.__v[i].numpy(), ord=2, axis=0)
-
- self.__g.append(
- self.add_parameter(
- "_g%d" % i,
- self.create_parameter(
- attr=fluid.ParamAttr(
- initializer=fluid.initializer.NumpyArrayInitializer(
- magnitude_value)),
- shape=magnitude_shape,
- dtype=self._dtype,
- is_bias=False)))
- i += 1
-
- size = list([self._size])
- self._b = self.create_parameter(
- attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True)
-
- def forward(self, input):
- mul_results = list()
- i = 0
- for inp, param in self._helper.iter_inputs_and_params(input,
- self._param_attr):
- v_norm = self._helper.create_variable_for_type_inference(
- self._dtype)
- v_normalized = self._helper.create_variable_for_type_inference(
- self._dtype)
- self._helper.append_op(
- type="norm",
- inputs={"X": self.__v[i]},
- outputs={"Out": v_normalized,
- "Norm": v_norm},
- attrs={"axis": 0,
- "epsilon": self._epsilon})
- weight = self._helper.create_variable_for_type_inference(
- self._dtype)
- self._helper.append_op(
- type="elementwise_mul",
- inputs={"X": [v_normalized],
- "Y": [self.__g[i]]},
- outputs={"Out": [weight]},
- attrs={"axis": 1})
- tmp = self._helper.create_variable_for_type_inference(self._dtype)
- self._helper.append_op(
- type="mul",
- inputs={"X": inp,
- "Y": weight},
- outputs={"Out": tmp},
- attrs={
- "x_num_col_dims": self._num_flatten_dims,
- "y_num_col_dims": 1
- })
- i += 1
- mul_results.append(tmp)
-
- if len(mul_results) == 1:
- pre_bias = mul_results[0]
- else:
- pre_bias = self._helper.create_variable_for_type_inference(
- self._dtype)
- self._helper.append_op(
- type="sum",
- inputs={"X": mul_results},
- outputs={"Out": pre_bias},
- attrs={"use_mkldnn": False})
-
- if self._b:
- pre_activation = self._helper.create_variable_for_type_inference(
- dtype=self._dtype)
- self._helper.append_op(
- type="elementwise_add",
- inputs={"X": [pre_bias],
- "Y": [self._b]},
- outputs={"Out": [pre_activation]},
- attrs={"axis": self._num_flatten_dims})
- else:
- pre_activation = pre_bias
- # Currently, we don't support inplace in dygraph mode
- return self._helper.append_activation(pre_activation, act=self._act)
-
-
-class Conv2D(dg.Layer):
- """
- The convolution2D layer calculates the output based on the input, filter
- and strides, paddings, dilations, groups parameters. Input and
- Output are in NCHW format, where N is batch size, C is the number of
- channels, H is the height of the feature, and W is the width of the feature.
- Filter is in MCHW format, where M is the number of output image channels,
- C is the number of input image channels, H is the height of the filter,
- and W is the width of the filter. If the groups is greater than 1,
- C will equal the number of input image channels divided by the groups.
- Please refer to UFLDL's `convolution
- `
- for more detials.
- If bias attribution and activation type are provided, bias is added to the
- output of the convolution, and the corresponding activation function is
- applied to the final result.
-
- For each input :math:`X`, the equation is:
-
- .. math::
-
- Out = \sigma ((Vg) \\ast X + b)
-
- Where:
-
- * :math:`X`: Input value, a tensor with NCHW format.
- * :math:`V`: Filter direction value, a tensor with MCHW format.
- * :math:`g`: Filter magnitude value, a tensor with M format.
- * :math:`\\ast`: Convolution operation.
- * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
- * :math:`\\sigma`: Activation function.
- * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-
- Example:
-
- - Input:
-
- Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
-
- Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
-
- - Output:
-
- Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
-
- Where
-
- .. math::
-
- H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
- W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
-
- Args:
- name_scope(str) : The name for this class.
- num_filters(int): The number of filter. It is as same as the output
- image channel.
- filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
- it must contain two integers, (filter_size_H, filter_size_W).
- Otherwise, the filter will be a square.
- stride (int|tuple): The stride size. If stride is a tuple, it must
- contain two integers, (stride_H, stride_W). Otherwise, the
- stride_H = stride_W = stride. Default: stride = 1.
- padding (int|tuple): The padding size. If padding is a tuple, it must
- contain two integers, (padding_H, padding_W). Otherwise, the
- padding_H = padding_W = padding. Default: padding = 0.
- dilation (int|tuple): The dilation size. If dilation is a tuple, it must
- contain two integers, (dilation_H, dilation_W). Otherwise, the
- dilation_H = dilation_W = dilation. Default: dilation = 1.
- groups (int): The groups number of the Conv2d Layer. According to grouped
- convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
- the first half of the filters is only connected to the first half
- of the input channels, while the second half of the filters is only
- connected to the second half of the input channels. Default: groups=1.
- param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
- of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
- will create ParamAttr as param_attr. If the Initializer of the param_attr
- is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
- and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
- bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
- If it is set to False, no bias will be added to the output units.
- If it is set to None or one attribute of ParamAttr, conv2d
- will create ParamAttr as bias_attr. If the Initializer of the bias_attr
- is not set, the bias is initialized zero. Default: None.
- use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
- library is installed. Default: True
- act (str): Activation type, if it is set to None, activation is not appended.
- Default: None
-
- Raises:
- ValueError: If the shapes of input, filter_size, stride, padding and
- groups mismatch.
-
- Examples:
- .. code-block:: python
-
- from paddle.fluid.dygraph.base import to_variable
- import paddle.fluid as fluid
- from paddle.fluid.dygraph import Conv2D
- import numpy as np
-
- data = np.random.uniform( -1, 1, [10, 3, 32, 32] ).astype('float32')
- with fluid.dygraph.guard():
- conv2d = Conv2D( "conv2d", 2, 3)
- data = to_variable( data )
- conv = conv2d( data )
-
- """
-
- def __init__(self,
- name_scope,
- num_filters,
- filter_size,
- stride=1,
- padding=0,
- dilation=1,
- groups=None,
- param_attr=None,
- bias_attr=None,
- use_cudnn=True,
- act=None,
- epsilon=1e-30,
- dtype="float32"):
- assert param_attr is not False, "param_attr should not be False here."
- super(Conv2D, self).__init__(name_scope, dtype)
- self._groups = groups
- self._stride = utils.convert_to_list(stride, 2, "stride")
- self._padding = utils.convert_to_list(padding, 2, "padding")
- self._dilation = utils.convert_to_list(dilation, 2, "dilation")
- self._act = act
- if not isinstance(use_cudnn, bool):
- raise ValueError("use_cudnn should be True or False")
- self._use_cudnn = use_cudnn
- self._filter_size = filter_size
- self._num_filters = num_filters
- self._param_attr = param_attr
- self._bias_attr = bias_attr
- self._epsilon = epsilon
- self._dtype = dtype
- # if (self._num_channels == self._groups and
- # num_filters % self._num_channels == 0 and not self._use_cudnn):
- # self._l_type = 'depthwise_conv2d'
- # else:
- # TODO(jiabin): recover the usage of depthwise_conv2d when it's
- # kernel fixed https://github.com/PaddlePaddle/Paddle/issues/17275
- self._l_type = "conv2d"
-
- def _build_once(self, input):
- self._num_channels = input.shape[1]
- if self._groups is None:
- num_filter_channels = self._num_channels
- else:
- if self._num_channels % self._groups != 0:
- raise ValueError("num_channels must be divisible by groups.")
- num_filter_channels = self._num_channels // self._groups
- filter_size = utils.convert_to_list(self._filter_size, 2, "filter_size")
- filter_shape = [self._num_filters, int(num_filter_channels)
- ] + filter_size
-
- def _get_default_param_initializer():
- filter_elem_num = filter_size[0] * filter_size[
- 1] * self._num_channels
- std = (2.0 / filter_elem_num)**0.5
- return Normal(0.0, std, 0)
-
- # weight_v
- self._filter_param_v = self.create_parameter(
- attr=self._param_attr,
- shape=filter_shape,
- dtype=self._dtype,
- default_initializer=_get_default_param_initializer())
-
- # weight_g
- norm_value = _norm(
- self._filter_param_v.numpy(), dim=0) # CAUTION: hard-code
- self._filter_param_g = self.create_parameter(
- attr=fluid.ParamAttr(
- initializer=fluid.initializer.NumpyArrayInitializer(
- norm_value)),
- shape=norm_value.shape,
- dtype=self._dtype,
- default_initializer=_get_default_param_initializer())
-
- if self._use_cudnn:
- self.create_variable(
- name="kCUDNNFwdAlgoCache",
- persistable=True,
- type=core.VarDesc.VarType.RAW)
- self.create_variable(
- name="kCUDNNBwdDataAlgoCache",
- persistable=True,
- type=core.VarDesc.VarType.RAW)
- self.create_variable(
- name="kCUDNNBwdFilterAlgoCache",
- persistable=True,
- type=core.VarDesc.VarType.RAW)
-
- self._bias_param = self.create_parameter(
- attr=self._bias_attr,
- shape=[self._num_filters],
- dtype=self._dtype,
- is_bias=True)
-
- def forward(self, input):
- matrix = self._helper.create_variable_for_type_inference(self._dtype)
- tmp = self._helper.create_variable_for_type_inference(self._dtype)
- new_shape = [
- self._filter_param_v.shape[0],
- reduce(lambda x, y: x * y, self._filter_param_v.shape[1:], 1),
- ]
-
- self._helper.append_op(
- type="reshape2",
- inputs={"X": self._filter_param_v},
- attrs={"shape": new_shape},
- outputs={"Out": matrix,
- "XShape": tmp})
-
- m_norm = self._helper.create_variable_for_type_inference(self._dtype)
- m_normalized = self._helper.create_variable_for_type_inference(
- self._dtype)
- self._helper.append_op(
- type="norm",
- inputs={"X": matrix},
- outputs={"Out": m_normalized,
- "Norm": m_norm},
- attrs={"axis": 1,
- "epsilon": self._epsilon})
-
- v_normalized = self._helper.create_variable_for_type_inference(
- self._dtype)
- tmp2 = self._helper.create_variable_for_type_inference(self._dtype)
- self._helper.append_op(
- type="reshape2",
- inputs={"X": m_normalized},
- attrs={"shape": self._filter_param_v.shape},
- outputs={"Out": v_normalized,
- "XShape": tmp2})
-
- filter_param = self._helper.create_variable_for_type_inference(
- self._dtype)
- self._helper.append_op(
- type="elementwise_mul",
- inputs={"X": [v_normalized],
- "Y": [self._filter_param_g]},
- outputs={"Out": [filter_param]},
- attrs={"axis": 0}, # CAUTION: hard-code
- )
-
- pre_bias = self._helper.create_variable_for_type_inference(
- dtype=self._dtype)
-
- self._helper.append_op(
- type=self._l_type,
- inputs={"Input": input,
- "Filter": filter_param},
- outputs={"Output": pre_bias},
- attrs={
- "strides": self._stride,
- "paddings": self._padding,
- "dilations": self._dilation,
- "groups": self._groups if self._groups else 1,
- "use_cudnn": self._use_cudnn,
- "use_mkldnn": False,
- })
-
- if self._bias_param is not None:
- pre_act = self._helper.create_variable_for_type_inference(
- dtype=self._dtype)
- self._helper.append_op(
- type="elementwise_add",
- inputs={"X": [pre_bias],
- "Y": [self._bias_param]},
- outputs={"Out": [pre_act]},
- attrs={"axis": 1})
- else:
- pre_act = pre_bias
-
- # Currently, we don't support inplace in dygraph mode
- return self._helper.append_activation(pre_act, act=self._act)
-
-
-class Conv2DTranspose(dg.Layer):
- """
- **Convlution2D transpose layer**
-
- The convolution2D transpose layer calculates the output based on the input,
- filter, and dilations, strides, paddings. Input(Input) and output(Output)
- are in NCHW format. Where N is batch size, C is the number of channels,
- H is the height of the feature, and W is the width of the feature.
- Parameters(dilations, strides, paddings) are two elements. These two elements
- represent height and width, respectively. The details of convolution transpose
- layer, please refer to the following explanation and references
- `therein `_.
- If bias attribution and activation type are provided, bias is added to
- the output of the convolution, and the corresponding activation function
- is applied to the final result.
-
- For each input :math:`X`, the equation is:
-
- .. math::
-
- Out = \sigma ((Vg) \\ast X + b)
-
- Where:
-
- * :math:`X`: Input value, a tensor with NCHW format.
- * :math:`V`: Filter value, a tensor with MCHW format.
- * :math:`g`: Filter value, a tensor with M format.
- * :math:`\\ast`: Convolution operation.
- * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
- * :math:`\\sigma`: Activation function.
- * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-
- Example:
-
- - Input:
-
- Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
-
- Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`
-
- - Output:
-
- Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
-
- Where
-
- .. math::
-
- H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
- W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
- H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
- W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
-
- Args:
- name_scope(str): The name of this class.
- num_filters(int): The number of the filter. It is as same as the output
- image channel.
- output_size(int|tuple|None): The output image size. If output size is a
- tuple, it must contain two integers, (image_H, image_W). None if use
- filter_size, padding, and stride to calculate output_size.
- if output_size and filter_size are specified at the same time, They
- should follow the formula above. Default: None.
- filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
- it must contain two integers, (filter_size_H, filter_size_W).
- Otherwise, the filter will be a square. None if use output size to
- calculate filter_size. Default: None.
- padding(int|tuple): The padding size. If padding is a tuple, it must
- contain two integers, (padding_H, padding_W). Otherwise, the
- padding_H = padding_W = padding. Default: padding = 0.
- stride(int|tuple): The stride size. If stride is a tuple, it must
- contain two integers, (stride_H, stride_W). Otherwise, the
- stride_H = stride_W = stride. Default: stride = 1.
- dilation(int|tuple): The dilation size. If dilation is a tuple, it must
- contain two integers, (dilation_H, dilation_W). Otherwise, the
- dilation_H = dilation_W = dilation. Default: dilation = 1.
- groups(int): The groups number of the Conv2d transpose layer. Inspired by
- grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
- when group=2, the first half of the filters is only connected to the
- first half of the input channels, while the second half of the
- filters is only connected to the second half of the input channels.
- Default: groups = 1.
- param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
- of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose
- will create ParamAttr as param_attr. If the Initializer of the param_attr
- is not set, the parameter is initialized with Xavier. Default: None.
- bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d_transpose.
- If it is set to False, no bias will be added to the output units.
- If it is set to None or one attribute of ParamAttr, conv2d_transpose
- will create ParamAttr as bias_attr. If the Initializer of the bias_attr
- is not set, the bias is initialized zero. Default: None.
- use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
- library is installed. Default: True.
- act (str): Activation type, if it is set to None, activation is not appended.
- Default: None.
-
- Returns:
- Variable: The tensor variable storing the convolution transpose result.
-
- Raises:
- ValueError: If the shapes of input, filter_size, stride, padding and
- groups mismatch.
-
- Examples:
- .. code-block:: python
-
- import paddle.fluid as fluid
- import numpy
-
- with fluid.dygraph.guard():
- data = numpy.random.random((3, 32, 32)).astype('float32')
- conv2DTranspose = fluid.dygraph.nn.Conv2DTranspose(
- 'Conv2DTranspose', num_filters=2, filter_size=3)
- ret = conv2DTranspose(fluid.dygraph.base.to_variable(data))
-
- """
-
- def __init__(self,
- name_scope,
- num_filters,
- output_size=None,
- filter_size=None,
- padding=0,
- stride=1,
- dilation=1,
- groups=None,
- param_attr=None,
- bias_attr=None,
- use_cudnn=True,
- epsilon=1e-30,
- act=None,
- dtype="float32"):
- super(Conv2DTranspose, self).__init__(name_scope, dtype)
- assert (param_attr is not False
- ), "param_attr should not be False in conv2d_transpose."
- self._param_attr = param_attr
- self._bias_attr = bias_attr
- self._groups = groups
- self._num_filters = num_filters
- self._use_cudnn = use_cudnn
- self._padding = padding
- self._stride = stride
- self._dilation = dilation
- self._filter_size = filter_size
- self._output_size = output_size
- self._op_type = "conv2d_transpose"
- self._epsilon = epsilon
-
- def _build_once(self, input):
- input_channel = input.shape[1]
- if (input_channel == self._groups and
- self._num_filters == input_channel and not self._use_cudnn):
- self._op_type = "depthwise_conv2d_transpose"
-
- if not isinstance(input, Variable):
- raise TypeError("Input of conv2d_transpose must be Variable")
-
- self._padding = utils.convert_to_list(self._padding, 2, "padding")
- self._stride = utils.convert_to_list(self._stride, 2, "stride")
- self._dilation = utils.convert_to_list(self._dilation, 2, "dilation")
-
- if not isinstance(self._use_cudnn, bool):
- raise ValueError("use_cudnn should be True or False")
-
- if self._filter_size is None:
- if self._output_size is None:
- raise ValueError(
- "output_size must be set when filter_size is None")
- if isinstance(self._output_size, int):
- self._output_size = [self._output_size, self._output_size]
-
- h_in = input.shape[2]
- w_in = input.shape[3]
-
- filter_size_h = (self._output_size[0] -
- (h_in - 1) * self._stride[0] + 2 * self._padding[0]
- - 1) // self._dilation[0] + 1
- filter_size_w = (self._output_size[1] -
- (w_in - 1) * self._stride[1] + 2 * self._padding[1]
- - 1) // self._dilation[1] + 1
- self._filter_size = [filter_size_h, filter_size_w]
- else:
- self._filter_size = utils.convert_to_list(
- self._filter_size, 2, "conv2d_transpose.filter_size")
-
- if self._output_size is None:
- self._output_size = []
- elif isinstance(self._output_size, list) or isinstance(
- self._output_size, int):
- self._output_size = utils.convert_to_list(self._output_size, 2,
- "output_size")
- else:
- raise ValueError("output_size should be list or int")
- self._padding = utils.convert_to_list(self._padding, 2, "padding")
- self._groups = 1 if self._groups is None else self._groups
- filter_shape = [
- input_channel,
- self._num_filters // self._groups,
- ] + self._filter_size
-
- # img filter v (direction)
- self._img_filter_v = self.create_parameter(
- dtype=input.dtype, shape=filter_shape, attr=self._param_attr)
-
- # img filter g (magnitude)
- img_filter_magnitude = _norm(
- self._img_filter_v.numpy(), dim=0) # CAUTION: hard-code
- self._img_filter_g = self.create_parameter(
- dtype=input.dtype,
- shape=img_filter_magnitude.shape,
- attr=fluid.ParamAttr(
- initializer=NumpyArrayInitializer(img_filter_magnitude)))
-
- self._img_bias = self.create_parameter(
- attr=self._bias_attr,
- shape=[self._num_filters],
- dtype=self._dtype,
- is_bias=True)
-
- def forward(self, input):
- matrix = self._helper.create_variable_for_type_inference(self._dtype)
- tmp = self._helper.create_variable_for_type_inference(self._dtype)
- new_shape = [
- self._img_filter_v.shape[0],
- reduce(lambda x, y: x * y, self._img_filter_v.shape[1:], 1),
- ]
-
- self._helper.append_op(
- type="reshape2",
- inputs={"X": self._img_filter_v},
- attrs={"shape": new_shape},
- outputs={"Out": matrix,
- "XShape": tmp})
-
- m_norm = self._helper.create_variable_for_type_inference(self._dtype)
- m_normalized = self._helper.create_variable_for_type_inference(
- self._dtype)
- self._helper.append_op(
- type="norm",
- inputs={"X": matrix},
- outputs={"Out": m_normalized,
- "Norm": m_norm},
- attrs={"axis": 1,
- "epsilon": self._epsilon})
-
- v_normalized = self._helper.create_variable_for_type_inference(
- self._dtype)
- tmp2 = self._helper.create_variable_for_type_inference(self._dtype)
- self._helper.append_op(
- type="reshape2",
- inputs={"X": m_normalized},
- attrs={"shape": self._img_filter_v.shape},
- outputs={"Out": v_normalized,
- "XShape": tmp2})
-
- img_filter = self._helper.create_variable_for_type_inference(
- self._dtype)
- self._helper.append_op(
- type="elementwise_mul",
- inputs={"X": [v_normalized],
- "Y": [self._img_filter_g]},
- outputs={"Out": [img_filter]},
- attrs={"axis": 0}, # CAUTION: hard-code
- )
-
- pre_bias = self._helper.create_variable_for_type_inference(
- dtype=input.dtype)
- self._helper.append_op(
- type=self._op_type,
- inputs={"Input": [input],
- "Filter": [img_filter]},
- outputs={"Output": pre_bias},
- attrs={
- "output_size": self._output_size,
- "strides": self._stride,
- "paddings": self._padding,
- "dilations": self._dilation,
- "groups": self._groups,
- "use_cudnn": self._use_cudnn,
- })
-
- if self._img_bias is not None:
- pre_act = self._helper.create_variable_for_type_inference(
- dtype=self._dtype)
- self._helper.append_op(
- type="elementwise_add",
- inputs={"X": [pre_bias],
- "Y": [self._img_bias]},
- outputs={"Out": [pre_act]},
- attrs={"axis": 1})
- else:
- pre_act = pre_bias
-
- out = self._helper.append_activation(pre_act)
- return out