diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py index 26f59dfb13f89c32d6aee782552b011d090b087d..855ceef96f5fced0bb2f1299bc011fe1fa663ec3 100644 --- a/paddlespeech/s2t/__init__.py +++ b/paddlespeech/s2t/__init__.py @@ -21,7 +21,6 @@ from paddle import nn from paddle.fluid import core from paddle.nn import functional as F -from paddlespeech.s2t.modules import initializer from paddlespeech.s2t.utils.log import Log #TODO(Hui Zhang): remove fluid import @@ -506,8 +505,3 @@ if not hasattr(paddle.nn, 'LayerDict'): logger.debug( "register user LayerDict to paddle.nn, remove this when fixed!") setattr(paddle.nn, 'LayerDict', LayerDict) - -""" - hack KaiminigUniform: change limit from np.sqrt(6.0 / float(fan_in)) to np.sqrt(1.0 / float(fan_in)) -""" -paddle.nn.initializer.KaimingUniform = initializer.KaimingUniform diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index d7bee6d7fe753554916d6b32e38756004507a49f..bcbc15d64ed2308eb197f5be13fff5567519d7a1 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -239,7 +239,7 @@ class U2Trainer(Trainer): n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1, - dist_sampler=False, + dist_sampler=True, shortest_first=False) self.valid_loader = BatchDataLoader( @@ -260,7 +260,7 @@ class U2Trainer(Trainer): n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1, - dist_sampler=False, + dist_sampler=True, shortest_first=False) logger.info("Setup train/valid Dataloader!") else: diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 67ec5924af8d99a95df64acd40d179be9bcc7885..e077cd5b7ccc058d37c2eeb47e53eb055b2596d0 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -41,7 +41,6 @@ from paddlespeech.s2t.modules.mask import make_pad_mask from paddlespeech.s2t.modules.mask import mask_finished_preds from paddlespeech.s2t.modules.mask import mask_finished_scores from paddlespeech.s2t.modules.mask import subsequent_mask -from paddlespeech.s2t.modules.nets_utils import initialize from paddlespeech.s2t.utils import checkpoint from paddlespeech.s2t.utils import layer_tools from paddlespeech.s2t.utils.ctc_utils import remove_duplicates_and_blank @@ -51,6 +50,8 @@ from paddlespeech.s2t.utils.tensor_utils import pad_sequence from paddlespeech.s2t.utils.tensor_utils import th_accuracy from paddlespeech.s2t.utils.utility import log_add from paddlespeech.s2t.utils.utility import UpdateConfig +from paddlespeech.s2t.modules.initializer import DefaultInitializerContext +# from paddlespeech.s2t.modules.initializer import initialize __all__ = ["U2Model", "U2InferModel"] @@ -784,11 +785,8 @@ class U2Model(U2DecodeModel): def __init__(self, configs: dict): model_conf = configs.get('model_conf', dict()) init_type = model_conf.get("init_type", None) - if init_type is not None: - logger.info(f"Use {init_type} initializer as default initializer") - initialize(self, init_type) - vocab_size, encoder, decoder, ctc = U2Model._init_from_config(configs) - nn.initializer.set_global_initializer(None) + with DefaultInitializerContext(init_type): + vocab_size, encoder, decoder, ctc = U2Model._init_from_config(configs) super().__init__( vocab_size=vocab_size, diff --git a/paddlespeech/s2t/modules/activation.py b/paddlespeech/s2t/modules/activation.py index 4081f7f81a5ca9a0b8594ff01cff23ef6d3eac94..48c84fa634579f7483ec228434206b46147e6ba1 100644 --- a/paddlespeech/s2t/modules/activation.py +++ 
b/paddlespeech/s2t/modules/activation.py @@ -16,7 +16,8 @@ from collections import OrderedDict import paddle from paddle import nn from paddle.nn import functional as F - +from paddlespeech.s2t.modules.align import Linear +from paddlespeech.s2t.modules.align import Conv2D from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() @@ -51,7 +52,7 @@ class LinearGLUBlock(nn.Layer): idim (int): input and output dimension """ super().__init__() - self.fc = nn.Linear(idim, idim * 2) + self.fc = Linear(idim, idim * 2) def forward(self, xs): return glu(self.fc(xs), dim=-1) @@ -75,7 +76,7 @@ class ConvGLUBlock(nn.Layer): self.conv_residual = None if in_ch != out_ch: self.conv_residual = nn.utils.weight_norm( - nn.Conv2D( + Conv2D( in_channels=in_ch, out_channels=out_ch, kernel_size=(1, 1)), name='weight', dim=0) @@ -86,7 +87,7 @@ class ConvGLUBlock(nn.Layer): layers = OrderedDict() if bottlececk_dim == 0: layers['conv'] = nn.utils.weight_norm( - nn.Conv2D( + Conv2D( in_channels=in_ch, out_channels=out_ch * 2, kernel_size=(kernel_size, 1)), @@ -106,7 +107,7 @@ dim=0) layers['dropout_in'] = nn.Dropout(p=dropout) layers['conv_bottleneck'] = nn.utils.weight_norm( - nn.Conv2D( + Conv2D( in_channels=bottlececk_dim, out_channels=bottlececk_dim, kernel_size=(kernel_size, 1)), @@ -115,7 +116,7 @@ layers['dropout'] = nn.Dropout(p=dropout) layers['glu'] = GLU() layers['conv_out'] = nn.utils.weight_norm( - nn.Conv2D( + Conv2D( in_channels=bottlececk_dim, out_channels=out_ch * 2, kernel_size=(1, 1)), diff --git a/paddlespeech/s2t/modules/align.py b/paddlespeech/s2t/modules/align.py new file mode 100644 index 0000000000000000000000000000000000000000..575773d70b01423952e8f7aa57f55f1f670a2276 --- /dev/null +++ b/paddlespeech/s2t/modules/align.py @@ -0,0 +1,74 @@ +import paddle +from paddle import nn +from paddlespeech.s2t.modules.initializer import KaimingUniform + +""" + To align the initializer between paddle and torch, + the APIs below set a default initializer whose priority is higher than the global initializer.
+""" +global_init_type = None + + +class LayerNorm(nn.LayerNorm): + def __init__(self, normalized_shape, epsilon=1e-05, weight_attr=None, bias_attr=None, name=None): + if weight_attr is None: + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Constant(1.0)) + if bias_attr is None: + bias_attr = paddle.ParamAttr( + initializer=nn.initializer.Constant(0.0)) + super(LayerNorm, self).__init__(normalized_shape, epsilon, weight_attr, bias_attr, name) + +class BatchNorm1D(nn.BatchNorm1D): + def __init__(self, num_features, momentum=0.9, epsilon=1e-05, weight_attr=None, bias_attr=None, data_format='NCL', name=None): + if weight_attr is None: + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Constant(1.0)) + if bias_attr is None: + bias_attr = paddle.ParamAttr( + initializer=nn.initializer.Constant(0.0)) + super(BatchNorm1D, self).__init__(num_features, momentum, epsilon, weight_attr, bias_attr, data_format, name) + +class Embedding(nn.Embedding): + def __init__(self, num_embeddings, embedding_dim, padding_idx=None, sparse=False, weight_attr=None, name=None): + if weight_attr is None: + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Normal()) + super(Embedding, self).__init__(num_embeddings, embedding_dim, padding_idx, sparse, weight_attr, name) + +class Linear(nn.Linear): + def __init__(self, in_features, out_features, weight_attr=None, bias_attr=None, name=None): + if weight_attr is None: + if global_init_type == "kaiming_uniform": + weight_attr = paddle.ParamAttr( + initializer=KaimingUniform()) + if bias_attr is None: + if global_init_type == "kaiming_uniform": + bias_attr = paddle.ParamAttr( + initializer=KaimingUniform()) + super(Linear, self).__init__(in_features, out_features, weight_attr, bias_attr, name) + +class Conv1D(nn.Conv1D): + def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, padding_mode='zeros', weight_attr=None, bias_attr=None, data_format='NCL'): + if weight_attr is None: + if global_init_type == "kaiming_uniform": + print("set kaiming_uniform") + weight_attr = paddle.ParamAttr( + initializer=KaimingUniform()) + if bias_attr is None: + if global_init_type == "kaiming_uniform": + bias_attr = paddle.ParamAttr( + initializer=KaimingUniform()) + super(Conv1D, self).__init__(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, padding_mode, weight_attr, bias_attr, data_format) + +class Conv2D(nn.Conv2D): + def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, padding_mode='zeros', weight_attr=None, bias_attr=None, data_format='NCHW'): + if weight_attr is None: + if global_init_type == "kaiming_uniform": + weight_attr = paddle.ParamAttr( + initializer=KaimingUniform()) + if bias_attr is None: + if global_init_type == "kaiming_uniform": + bias_attr = paddle.ParamAttr( + initializer=KaimingUniform()) + super(Conv2D, self).__init__(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, padding_mode, weight_attr, bias_attr, data_format) diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index c2f5e503eb483cc3a8ee25d3a17a7fd91c123b9d..438efd2a14151904cb75ff6c72f7be01663bff09 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -22,6 +22,7 @@ import paddle from paddle import nn from paddle.nn import initializer as I +from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.utils.log import Log logger = 
Log(__name__).getlog() @@ -48,10 +49,10 @@ class MultiHeadedAttention(nn.Layer): # We assume d_v always equals d_k self.d_k = n_feat // n_head self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) + self.linear_q = Linear(n_feat, n_feat) + self.linear_k = Linear(n_feat, n_feat) + self.linear_v = Linear(n_feat, n_feat) + self.linear_out = Linear(n_feat, n_feat) self.dropout = nn.Dropout(p=dropout_rate) def forward_qkv(self, @@ -150,7 +151,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): """ super().__init__(n_head, n_feat, dropout_rate) # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias_attr=False) + self.linear_pos = Linear(n_feat, n_feat, bias_attr=False) # these two learnable bias are used in matrix c and matrix d # as described in https://arxiv.org/abs/1901.02860 Section 3.3 #self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) diff --git a/paddlespeech/s2t/modules/conformer_convolution.py b/paddlespeech/s2t/modules/conformer_convolution.py index 256d187c90779992ea67ddad12d83ab5ea309e9d..89e6526885a2679b8ab09a4e4e4423a15e51ac08 100644 --- a/paddlespeech/s2t/modules/conformer_convolution.py +++ b/paddlespeech/s2t/modules/conformer_convolution.py @@ -21,6 +21,9 @@ import paddle from paddle import nn from typeguard import check_argument_types +from paddlespeech.s2t.modules.align import BatchNorm1D +from paddlespeech.s2t.modules.align import Conv1D +from paddlespeech.s2t.modules.align import LayerNorm from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() @@ -49,7 +52,7 @@ class ConvolutionModule(nn.Layer): """ assert check_argument_types() super().__init__() - self.pointwise_conv1 = nn.Conv1D( + self.pointwise_conv1 = Conv1D( channels, 2 * channels, kernel_size=1, @@ -73,7 +76,7 @@ class ConvolutionModule(nn.Layer): padding = (kernel_size - 1) // 2 self.lorder = 0 - self.depthwise_conv = nn.Conv1D( + self.depthwise_conv = Conv1D( channels, channels, kernel_size, @@ -87,22 +90,12 @@ class ConvolutionModule(nn.Layer): assert norm in ['batch_norm', 'layer_norm'] if norm == "batch_norm": self.use_layer_norm = False - self.norm = nn.BatchNorm1D( - channels, - weight_attr=paddle.ParamAttr( - initializer=nn.initializer.Constant(1.0)), - bias_attr=paddle.ParamAttr( - initializer=nn.initializer.Constant(0.0))) + self.norm = BatchNorm1D(channels) else: self.use_layer_norm = True - self.norm = nn.LayerNorm( - channels, - weight_attr=paddle.ParamAttr( - initializer=nn.initializer.Constant(1.0)), - bias_attr=paddle.ParamAttr( - initializer=nn.initializer.Constant(0.0))) - - self.pointwise_conv2 = nn.Conv1D( + self.norm = LayerNorm(channels) + + self.pointwise_conv2 = Conv1D( channels, channels, kernel_size=1, diff --git a/paddlespeech/s2t/modules/ctc.py b/paddlespeech/s2t/modules/ctc.py index 2094182af1a6d31068288d865654bace577b5975..33ad472defba0a86bc945582f386acb406e4c35e 100644 --- a/paddlespeech/s2t/modules/ctc.py +++ b/paddlespeech/s2t/modules/ctc.py @@ -18,6 +18,7 @@ from paddle import nn from paddle.nn import functional as F from typeguard import check_argument_types +from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.modules.loss import CTCLoss from paddlespeech.s2t.utils import ctc_utils from paddlespeech.s2t.utils.log import Log @@ -69,7 +70,7 @@ class CTCDecoderBase(nn.Layer): self.blank_id = blank_id self.odim = odim self.dropout = 
nn.Dropout(dropout_rate) - self.ctc_lo = nn.Linear(enc_n_units, self.odim) + self.ctc_lo = Linear(enc_n_units, self.odim) reduction_type = "sum" if reduction else "none" self.criterion = CTCLoss( blank=self.blank_id, diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py index b0ae27e52e928e0fd20572fe94e2a68379a282bc..3a851ec62c35f633ce07fd0b4380d92b31d67b3b 100644 --- a/paddlespeech/s2t/modules/decoder.py +++ b/paddlespeech/s2t/modules/decoder.py @@ -24,6 +24,9 @@ from paddle import nn from typeguard import check_argument_types from paddlespeech.s2t.decoders.scorers.scorer_interface import BatchScorerInterface +from paddlespeech.s2t.modules.align import Embedding +from paddlespeech.s2t.modules.align import LayerNorm +from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.modules.attention import MultiHeadedAttention from paddlespeech.s2t.modules.decoder_layer import DecoderLayer from paddlespeech.s2t.modules.embedding import PositionalEncoding @@ -83,25 +86,15 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer): if input_layer == "embed": self.embed = nn.Sequential( - nn.Embedding( - vocab_size, - attention_dim, - weight_attr=paddle.ParamAttr( - initializer=nn.initializer.Normal())), + Embedding(vocab_size, attention_dim), PositionalEncoding(attention_dim, positional_dropout_rate), ) else: raise ValueError(f"only 'embed' is supported: {input_layer}") self.normalize_before = normalize_before - self.after_norm = nn.LayerNorm( - attention_dim, - epsilon=1e-12, - weight_attr=paddle.ParamAttr( - initializer=nn.initializer.Constant(1.0)), - bias_attr=paddle.ParamAttr( - initializer=nn.initializer.Constant(0.0))) + self.after_norm = LayerNorm(attention_dim, epsilon=1e-12) self.use_output_layer = use_output_layer - self.output_layer = nn.Linear(attention_dim, vocab_size) + self.output_layer = Linear(attention_dim, vocab_size) self.decoders = nn.LayerList([ DecoderLayer( diff --git a/paddlespeech/s2t/modules/decoder_layer.py b/paddlespeech/s2t/modules/decoder_layer.py index 8eee5ceb1bcc14880feb8cc6f8739ebef1a76192..b7f8694c12623ce82eb6849bcd9438483f513502 100644 --- a/paddlespeech/s2t/modules/decoder_layer.py +++ b/paddlespeech/s2t/modules/decoder_layer.py @@ -20,6 +20,8 @@ from typing import Tuple import paddle from paddle import nn +from paddlespeech.s2t.modules.align import LayerNorm +from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() @@ -62,32 +64,14 @@ class DecoderLayer(nn.Layer): self.self_attn = self_attn self.src_attn = src_attn self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm( - size, - epsilon=1e-12, - weight_attr=paddle.ParamAttr( - initializer=nn.initializer.Constant(1.0)), - bias_attr=paddle.ParamAttr( - initializer=nn.initializer.Constant(0.0))) - self.norm2 = nn.LayerNorm( - size, - epsilon=1e-12, - weight_attr=paddle.ParamAttr( - initializer=nn.initializer.Constant(1.0)), - bias_attr=paddle.ParamAttr( - initializer=nn.initializer.Constant(0.0))) - self.norm3 = nn.LayerNorm( - size, - epsilon=1e-12, - weight_attr=paddle.ParamAttr( - initializer=nn.initializer.Constant(1.0)), - bias_attr=paddle.ParamAttr( - initializer=nn.initializer.Constant(0.0))) + self.norm1 = LayerNorm(size, epsilon=1e-12) + self.norm2 = LayerNorm(size, epsilon=1e-12) + self.norm3 = LayerNorm(size, epsilon=1e-12) self.dropout = nn.Dropout(dropout_rate) self.normalize_before = normalize_before self.concat_after = concat_after - self.concat_linear1 = nn.Linear(size + size, 
size) - self.concat_linear2 = nn.Linear(size + size, size) + self.concat_linear1 = Linear(size + size, size) + self.concat_linear2 = Linear(size + size, size) def forward( self, diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index 5f7b8e99dfc967b69f635b12e681dff63f1c8802..71a2bad40bf5efdb1c9a2d783db278919f9a8c6f 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -23,6 +23,8 @@ from paddle import nn from typeguard import check_argument_types from paddlespeech.s2t.modules.activation import get_activation +from paddlespeech.s2t.modules.align import LayerNorm +from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.modules.attention import MultiHeadedAttention from paddlespeech.s2t.modules.attention import RelPositionMultiHeadedAttention from paddlespeech.s2t.modules.conformer_convolution import ConvolutionModule @@ -129,13 +131,7 @@ class BaseEncoder(nn.Layer): d_model=output_size, dropout_rate=positional_dropout_rate), ) self.normalize_before = normalize_before - self.after_norm = nn.LayerNorm( - output_size, - epsilon=1e-12, - weight_attr=paddle.ParamAttr( - initializer=nn.initializer.Constant(1.0)), - bias_attr=paddle.ParamAttr( - initializer=nn.initializer.Constant(0.0))) + self.after_norm = LayerNorm(output_size, epsilon=1e-12) self.static_chunk_size = static_chunk_size self.use_dynamic_chunk = use_dynamic_chunk self.use_dynamic_left_chunk = use_dynamic_left_chunk diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py index 69a3f67bb4edd32d81cbdb953cc7e5e605a11b56..e80a298d621ac87db8ad9f76e48041f05ec18f64 100644 --- a/paddlespeech/s2t/modules/encoder_layer.py +++ b/paddlespeech/s2t/modules/encoder_layer.py @@ -20,6 +20,8 @@ from typing import Tuple import paddle from paddle import nn +from paddlespeech.s2t.modules.align import LayerNorm +from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() @@ -59,15 +61,15 @@ class TransformerEncoderLayer(nn.Layer): super().__init__() self.self_attn = self_attn self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, epsilon=1e-12) - self.norm2 = nn.LayerNorm(size, epsilon=1e-12) + self.norm1 = LayerNorm(size, epsilon=1e-12) + self.norm2 = LayerNorm(size, epsilon=1e-12) self.dropout = nn.Dropout(dropout_rate) self.size = size self.normalize_before = normalize_before self.concat_after = concat_after # concat_linear may be not used in forward fuction, # but will be saved in the *.pt - self.concat_linear = nn.Linear(size + size, size) + self.concat_linear = Linear(size + size, size) def forward( self, @@ -174,51 +176,23 @@ class ConformerEncoderLayer(nn.Layer): self.feed_forward = feed_forward self.feed_forward_macaron = feed_forward_macaron self.conv_module = conv_module - self.norm_ff = nn.LayerNorm( - size, - epsilon=1e-12, - weight_attr=paddle.ParamAttr( - initializer=nn.initializer.Constant(1.0)), - bias_attr=paddle.ParamAttr( - initializer=nn.initializer.Constant(0.0))) # for the FNN module - self.norm_mha = nn.LayerNorm( - size, - epsilon=1e-12, - weight_attr=paddle.ParamAttr( - initializer=nn.initializer.Constant(1.0)), - bias_attr=paddle.ParamAttr( - initializer=nn.initializer.Constant(0.0))) # for the MHA module + self.norm_ff = LayerNorm(size, epsilon=1e-12) # for the FNN module + self.norm_mha = LayerNorm(size, epsilon=1e-12) # for the MHA module if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm( - 
size, - epsilon=1e-12, - weight_attr=paddle.ParamAttr( - initializer=nn.initializer.Constant(1.0)), - bias_attr=paddle.ParamAttr( - initializer=nn.initializer.Constant(0.0))) + self.norm_ff_macaron = LayerNorm(size, epsilon=1e-12) self.ff_scale = 0.5 else: self.ff_scale = 1.0 if self.conv_module is not None: - self.norm_conv = nn.LayerNorm( - size, - epsilon=1e-12, - weight_attr=paddle.ParamAttr( - initializer=nn.initializer.Constant(1.0)), - bias_attr=paddle.ParamAttr(initializer=nn.initializer.Constant( - 0.0))) # for the CNN module - self.norm_final = nn.LayerNorm( - size, - epsilon=1e-12, - weight_attr=paddle.ParamAttr( - initializer=nn.initializer.Constant(1.0)), - bias_attr=paddle.ParamAttr(initializer=nn.initializer.Constant( - 0.0))) # for the final output of the block + self.norm_conv = LayerNorm( + size, epsilon=1e-12) # for the CNN module + self.norm_final = LayerNorm( + size, epsilon=1e-12) # for the final output of the block self.dropout = nn.Dropout(dropout_rate) self.size = size self.normalize_before = normalize_before self.concat_after = concat_after - self.concat_linear = nn.Linear(size + size, size) + self.concat_linear = Linear(size + size, size) def forward( self, diff --git a/paddlespeech/s2t/modules/initializer.py b/paddlespeech/s2t/modules/initializer.py index c91ab231741581a10b32bb9d816d44b2d0ec918b..3fbab2853208948176a1b94865b6e6d02b124e1b 100644 --- a/paddlespeech/s2t/modules/initializer.py +++ b/paddlespeech/s2t/modules/initializer.py @@ -11,93 +11,35 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from __future__ import print_function - -from paddle.fluid import framework -from paddle.fluid.framework import in_dygraph_mode, default_main_program import numpy as np -from paddle.fluid.core import VarDesc +from paddle import nn +from paddle.fluid import framework from paddle.fluid import unique_name +from paddle.fluid.core import VarDesc +from paddle.fluid.framework import default_main_program +from paddle.fluid.framework import in_dygraph_mode +from paddle.fluid.initializer import Initializer +from paddle.fluid.initializer import MSRAInitializer +from typeguard import check_argument_types -__all__ = [ - 'MSRAInitializer' -] - - -class Initializer(object): - """Base class for variable initializers - - Defines the common interface of variable initializers. - They add operations to the init program that are used - to initialize variables. Users should not use this class - directly, but need to use one of its implementations. - """ - - def __init__(self): - pass - - def __call__(self, param, block=None): - """Add corresponding initialization operations to the network - """ - raise NotImplementedError() - - def _check_block(self, block): - if block is None: - block = default_main_program().global_block() - - return block - - def _compute_fans(self, var): - """Compute the fan_in and the fan_out for layers - - This method computes the fan_in and the fan_out - for neural network layers, if not specified. It is - not possible to perfectly estimate fan_in and fan_out. - This method will estimate it correctly for matrix multiply and - convolutions. 
- - Args: - var: variable for which fan_in and fan_out have to be computed - - Returns: - tuple of two integers (fan_in, fan_out) - """ - shape = var.shape - if not shape or len(shape) == 0: - fan_in = fan_out = 1 - elif len(shape) == 1: - fan_in = fan_out = shape[0] - elif len(shape) == 2: - # This is the case for simple matrix multiply - fan_in = shape[0] - fan_out = shape[1] - else: - # Assume this to be a convolutional kernel - # In PaddlePaddle, the shape of the kernel is like: - # [num_filters, num_filter_channels, ...] where the remaining - # dimensions are the filter_size - receptive_field_size = np.prod(shape[2:]) - fan_in = shape[1] * receptive_field_size - fan_out = shape[0] * receptive_field_size - - return (fan_in, fan_out) - +__all__ = ['KaimingUniform'] -class MSRAInitializer(Initializer): - r"""Implements the MSRA initializer a.k.a. Kaiming Initializer +class KaimingUniform(MSRAInitializer): + r"""Implements the Kaiming Uniform initializer This class implements the weight initialization from the paper `Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification `_ by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a robust initialization method that particularly considers the rectifier - nonlinearities. In case of Uniform distribution, the range is [-x, x], where + nonlinearities. + + In case of Uniform distribution, the range is [-x, x], where .. math:: - x = \sqrt{\\frac{6.0}{fan\_in}} + x = \sqrt{\frac{1.0}{fan\_in}} In case of Normal distribution, the mean is 0 and the standard deviation is @@ -107,10 +49,8 @@ class MSRAInitializer(Initializer): \sqrt{\\frac{2.0}{fan\_in}} Args: - uniform (bool): whether to use uniform or normal distribution - fan_in (float32|None): fan_in for MSRAInitializer. If None, it is\ + fan_in (float32|None): fan_in for Kaiming uniform Initializer. If None, it is\ inferred from the variable. default is None. - seed (int32): random seed Note: It is recommended to set fan_in to None for most cases. @@ -119,23 +59,19 @@ class MSRAInitializer(Initializer): .. code-block:: python import paddle - import paddle.fluid as fluid - paddle.enable_static() - x = fluid.data(name="data", shape=[8, 32, 32], dtype="float32") - fc = fluid.layers.fc(input=x, size=10, - param_attr=fluid.initializer.MSRA(uniform=False)) + import paddle.nn as nn + + linear = nn.Linear(2, + 4, + weight_attr=nn.initializer.KaimingUniform()) + data = paddle.rand([30, 10, 2], dtype='float32') + res = linear(data) """ - def __init__(self, uniform=True, fan_in=None, seed=0): - """Constructor for MSRAInitializer - """ - assert uniform is not None - assert seed is not None - super(MSRAInitializer, self).__init__() - self._uniform = uniform - self._fan_in = fan_in - self._seed = seed + def __init__(self, fan_in=None): + super(KaimingUniform, self).__init__( + uniform=True, fan_in=fan_in, seed=0) def __call__(self, var, block=None): """Initialize the input tensor with MSRA initialization. 
@@ -165,8 +101,8 @@ class MSRAInitializer(Initializer): var.dtype == VarDesc.VarType.BF16 and not self._uniform): out_dtype = VarDesc.VarType.FP32 out_var = block.create_var( - name=unique_name.generate(".".join( - ['masra_init', var.name, 'tmp'])), + name=unique_name.generate( + ".".join(['masra_init', var.name, 'tmp'])), shape=var.shape, dtype=out_dtype, type=VarDesc.VarType.LOD_TENSOR, @@ -217,56 +153,23 @@ class MSRAInitializer(Initializer): var.op = op return op -class KaimingUniform(MSRAInitializer): - r"""Implements the Kaiming Uniform initializer - - This class implements the weight initialization from the paper - `Delving Deep into Rectifiers: Surpassing Human-Level Performance on - ImageNet Classification `_ - by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a - robust initialization method that particularly considers the rectifier - nonlinearities. - - In case of Uniform distribution, the range is [-x, x], where - - .. math:: - - x = \sqrt{\frac{6.0}{fan\_in}} - - Args: - fan_in (float32|None): fan_in for Kaiming uniform Initializer. If None, it is\ - inferred from the variable. default is None. - - Note: - It is recommended to set fan_in to None for most cases. - - Examples: - .. code-block:: python - - import paddle - import paddle.nn as nn - - linear = nn.Linear(2, - 4, - weight_attr=nn.initializer.KaimingUniform()) - data = paddle.rand([30, 10, 2], dtype='float32') - res = linear(data) +class DefaultInitializerContext(object): """ - - def __init__(self, fan_in=None): - super(KaimingUniform, self).__init__( - uniform=True, fan_in=fan_in, seed=0) - + egs: + with DefaultInitializerContext("kaiming_uniform"): + code for setup_model + """ + def __init__(self, init_type=None): + self.init_type = init_type + + def __enter__(self): + from paddlespeech.s2t.modules import align + align.global_init_type = self.init_type + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + from paddlespeech.s2t.modules import align + align.global_init_type = None -# We short the class name, since users will use the initializer with the package -# name. The sample code: -# -# import paddle.fluid as fluid -# -# hidden = fluid.layers.fc(..., -# param_attr=ParamAttr(fluid.initializer.Xavier())) -# -# It is no need to add an `Initializer` as the class suffix -MSRA = MSRAInitializer diff --git a/paddlespeech/s2t/modules/nets_utils.py b/paddlespeech/s2t/modules/nets_utils.py deleted file mode 100644 index 10915c8c3ead76db6ae6a233fa7c9bc27fba7ff8..0000000000000000000000000000000000000000 --- a/paddlespeech/s2t/modules/nets_utils.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from espnet(https://github.com/espnet/espnet) -from paddle import nn -from typeguard import check_argument_types - -def initialize(model: nn.Layer, init: str): - """Initialize weights of a neural network module. - - Parameters are initialized using the given method or distribution. 
- - Custom initialization routines can be implemented into submodules - - Args: - model (nn.Layer): Target. - init (str): Method of initialization. - """ - assert check_argument_types() - - if init == "xavier_uniform": - nn.initializer.set_global_initializer(nn.initializer.XavierUniform(), - nn.initializer.Constant()) - elif init == "xavier_normal": - nn.initializer.set_global_initializer(nn.initializer.XavierNormal(), - nn.initializer.Constant()) - elif init == "kaiming_uniform": - nn.initializer.set_global_initializer(nn.initializer.KaimingUniform(), - nn.initializer.KaimingUniform()) - elif init == "kaiming_normal": - nn.initializer.set_global_initializer(nn.initializer.KaimingNormal(), - nn.initializer.Constant()) - else: - raise ValueError("Unknown initialization: " + init) diff --git a/paddlespeech/s2t/modules/positionwise_feed_forward.py b/paddlespeech/s2t/modules/positionwise_feed_forward.py index e2619cd49dc15ef7d9ddb1fbbb991f3fe3eb1c35..c2725dc5cc4aac28d04e44333e185082d7300d44 100644 --- a/paddlespeech/s2t/modules/positionwise_feed_forward.py +++ b/paddlespeech/s2t/modules/positionwise_feed_forward.py @@ -17,6 +17,7 @@ import paddle from paddle import nn +from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() @@ -44,10 +45,10 @@ class PositionwiseFeedForward(nn.Layer): activation (paddle.nn.Layer): Activation function """ super().__init__() - self.w_1 = nn.Linear(idim, hidden_units) + self.w_1 = Linear(idim, hidden_units) self.activation = activation self.dropout = nn.Dropout(dropout_rate) - self.w_2 = nn.Linear(hidden_units, idim) + self.w_2 = Linear(hidden_units, idim) def forward(self, xs: paddle.Tensor) -> paddle.Tensor: """Forward function. diff --git a/paddlespeech/s2t/modules/subsampling.py b/paddlespeech/s2t/modules/subsampling.py index 99a8300f246149e924fe741f53934259d404e4e8..88451ddd77f6f89f8597238ddb1236acaa1945d7 100644 --- a/paddlespeech/s2t/modules/subsampling.py +++ b/paddlespeech/s2t/modules/subsampling.py @@ -19,6 +19,9 @@ from typing import Tuple import paddle from paddle import nn +from paddlespeech.s2t.modules.align import Conv2D +from paddlespeech.s2t.modules.align import LayerNorm +from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.modules.embedding import PositionalEncoding from paddlespeech.s2t.utils.log import Log @@ -60,8 +63,8 @@ class LinearNoSubsampling(BaseSubsampling): """ super().__init__(pos_enc_class) self.out = nn.Sequential( - nn.Linear(idim, odim), - nn.LayerNorm(odim, epsilon=1e-12), + Linear(idim, odim), + LayerNorm(odim, epsilon=1e-12), nn.Dropout(dropout_rate), nn.ReLU(), ) self.right_context = 0 @@ -108,12 +111,12 @@ class Conv2dSubsampling4(Conv2dSubsampling): """ super().__init__(pos_enc_class) self.conv = nn.Sequential( - nn.Conv2D(1, odim, 3, 2), + Conv2D(1, odim, 3, 2), nn.ReLU(), - nn.Conv2D(odim, odim, 3, 2), + Conv2D(odim, odim, 3, 2), nn.ReLU(), ) self.out = nn.Sequential( - nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) + Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) self.subsampling_rate = 4 # The right context for every conv layer is computed by: # (kernel_size - 1) * frame_rate_of_this_layer @@ -160,13 +163,13 @@ class Conv2dSubsampling6(Conv2dSubsampling): """ super().__init__(pos_enc_class) self.conv = nn.Sequential( - nn.Conv2D(1, odim, 3, 2), + Conv2D(1, odim, 3, 2), nn.ReLU(), - nn.Conv2D(odim, odim, 5, 3), + Conv2D(odim, odim, 5, 3), nn.ReLU(), ) # O = (I - F + Pstart + Pend) // S + 1 # when Padding == 0, O = (I - F - 
S) // S - self.linear = nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), odim) + self.linear = Linear(odim * (((idim - 1) // 2 - 2) // 3), odim) # The right context for every conv layer is computed by: # (kernel_size - 1) * frame_rate_of_this_layer # 10 = (3 - 1) * 1 + (5 - 1) * 2 @@ -212,14 +215,14 @@ class Conv2dSubsampling8(Conv2dSubsampling): """ super().__init__(pos_enc_class) self.conv = nn.Sequential( - nn.Conv2D(1, odim, 3, 2), + Conv2D(1, odim, 3, 2), nn.ReLU(), - nn.Conv2D(odim, odim, 3, 2), + Conv2D(odim, odim, 3, 2), nn.ReLU(), - nn.Conv2D(odim, odim, 3, 2), + Conv2D(odim, odim, 3, 2), nn.ReLU(), ) - self.linear = nn.Linear(odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), - odim) + self.linear = Linear(odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), + odim) self.subsampling_rate = 8 # The right context for every conv layer is computed by: # (kernel_size - 1) * frame_rate_of_this_layer
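For reference, a minimal sketch of how the new DefaultInitializerContext and the align.* wrappers introduced in this diff are meant to interact; the module paths and class names are taken from the code added above, while the layer sizes are illustrative only:

    from paddlespeech.s2t.modules.align import Linear
    from paddlespeech.s2t.modules.initializer import DefaultInitializerContext

    # Entering the context sets align.global_init_type to "kaiming_uniform",
    # so the align wrappers (Linear/Conv1D/Conv2D) build their weight/bias
    # ParamAttr with the KaimingUniform initializer defined in
    # paddlespeech.s2t.modules.initializer instead of depending on a
    # process-wide global initializer.
    with DefaultInitializerContext("kaiming_uniform"):
        proj = Linear(256, 256)  # initialized with KaimingUniform

    # On exit the context resets align.global_init_type to None, so layers
    # created afterwards fall back to Paddle's default initialization.

This mirrors how U2Model.__init__ now wraps U2Model._init_from_config(configs) in the context, replacing the previous set_global_initializer-based initialize() helper.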