diff --git a/python/paddle/distributed/sharding/group_sharded.py b/python/paddle/distributed/sharding/group_sharded.py index 012008913eee099fe4b5ee00a9c17a43626c1f23..9951594c2f5cb595756538cb069638bf4efc35d8 100644 --- a/python/paddle/distributed/sharding/group_sharded.py +++ b/python/paddle/distributed/sharding/group_sharded.py @@ -91,7 +91,7 @@ def group_sharded_parallel( # required: distributed import paddle - from paddle.fluid.dygraph.nn import Linear + from paddle.nn import Linear from paddle.distributed import fleet from paddle.distributed.sharding import group_sharded_parallel @@ -238,7 +238,7 @@ def save_group_sharded_model(model, output, optimizer=None): # required: distributed import paddle - from paddle.fluid.dygraph.nn import Linear + from paddle.nn import Linear from paddle.distributed import fleet from paddle.distributed.sharding import group_sharded_parallel, save_group_sharded_model diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py index ead2a89c372a1bafbbfcac54c44f93e136ee4c12..650a0afdeea8a84b282e51c7c7ebcb8766ced19b 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py @@ -23,7 +23,7 @@ from paddle.optimizer import Adam from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass from paddle.nn import Sequential -from paddle.fluid.dygraph import Linear +from paddle.nn import Linear from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose from paddle.fluid.log_helper import get_logger from paddle.fluid.framework import _test_eager_guard @@ -111,7 +111,7 @@ class ModelForConv2dT(nn.Layer): def __init__(self, num_classes=10): super().__init__() self.features = nn.Conv2DTranspose(4, 6, (3, 3)) - self.fc = Linear(input_dim=600, output_dim=num_classes) + self.fc = Linear(600, num_classes) def forward(self, inputs): x = self.features(inputs) @@ -143,11 +143,9 @@ class ImperativeLenet(paddle.nn.Layer): ) self.fc = Sequential( - Linear(input_dim=400, output_dim=120), - Linear(input_dim=120, output_dim=84), - Linear( - input_dim=84, output_dim=num_classes, act=classifier_activation - ), + Linear(400, 120), + Linear(120, 84), + Linear(84, num_classes), ) def forward(self, inputs): diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py index faa7a434e30102133973b633f7fe09a05bdca4d9..a6019d86928da2a8246967abc450dc592e010def 100644 --- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py +++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py @@ -821,11 +821,12 @@ class ReduceLROnPlateau(LearningRateDecay): .. 
code-block:: python import paddle.fluid as fluid + import paddle import numpy as np with fluid.dygraph.guard(): x = np.random.uniform(-1, 1, [10, 10]).astype("float32") - linear = fluid.dygraph.Linear(10, 10) + linear = paddle.nn.Linear(10, 10) input = fluid.dygraph.to_variable(x) reduce_lr = fluid.dygraph.ReduceLROnPlateau( @@ -842,7 +843,7 @@ class ReduceLROnPlateau(LearningRateDecay): total_loss = 0 for bath_id in range(5): out = linear(input) - loss = fluid.layers.reduce_mean(out) + loss = paddle.mean(out) total_loss += loss adam.minimize(loss) @@ -1090,9 +1091,10 @@ class StepDecay(_LearningRateEpochDecay): import paddle.fluid as fluid import numpy as np + import paddle with fluid.dygraph.guard(): x = np.random.uniform(-1, 1, [10, 10]).astype("float32") - linear = fluid.dygraph.Linear(10, 10) + linear = paddle.nn.Linear(10, 10) input = fluid.dygraph.to_variable(x) scheduler = fluid.dygraph.StepDecay(0.5, step_size=3) adam = fluid.optimizer.Adam(learning_rate = scheduler, parameter_list = linear.parameters()) @@ -1100,7 +1102,7 @@ class StepDecay(_LearningRateEpochDecay): for epoch in range(9): for batch_id in range(5): out = linear(input) - loss = fluid.layers.reduce_mean(out) + loss = paddle.mean(out) adam.minimize(loss) scheduler.epoch() @@ -1170,9 +1172,10 @@ class MultiStepDecay(_LearningRateEpochDecay): import paddle.fluid as fluid import numpy as np + import paddle with fluid.dygraph.guard(): x = np.random.uniform(-1, 1, [10, 10]).astype("float32") - linear = fluid.dygraph.Linear(10, 10) + linear = paddle.nn.Linear(10, 10) input = fluid.dygraph.to_variable(x) scheduler = fluid.dygraph.MultiStepDecay(0.5, milestones=[3, 5]) adam = fluid.optimizer.Adam(learning_rate = scheduler, parameter_list = linear.parameters()) @@ -1180,7 +1183,7 @@ class MultiStepDecay(_LearningRateEpochDecay): for epoch in range(6): for batch_id in range(5): out = linear(input) - loss = fluid.layers.reduce_mean(out) + loss = paddle.mean(out) adam.minimize(loss) scheduler.epoch() @@ -1255,9 +1258,10 @@ class LambdaDecay(_LearningRateEpochDecay): import paddle.fluid as fluid import numpy as np + import paddle with fluid.dygraph.guard(): x = np.random.uniform(-1, 1, [10, 10]).astype("float32") - linear = fluid.dygraph.Linear(10, 10) + linear = paddle.nn.Linear(10, 10) input = fluid.dygraph.to_variable(x) scheduler = fluid.dygraph.LambdaDecay(0.5, lr_lambda=lambda x: 0.95**x) adam = fluid.optimizer.Adam(learning_rate = scheduler, parameter_list = linear.parameters()) @@ -1265,7 +1269,7 @@ class LambdaDecay(_LearningRateEpochDecay): for epoch in range(6): for batch_id in range(5): out = linear(input) - loss = fluid.layers.reduce_mean(out) + loss = paddle.mean(out) adam.minimize(loss) scheduler.epoch() diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 8639f7294046f8b478e5616b7c5f4fa73a446633..f0b761fff82905f744acda32144c11ef0ccbcc1e 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -50,592 +50,11 @@ import paddle.utils.deprecated as deprecated from paddle import _C_ops, _legacy_C_ops __all__ = [ - 'Conv3D', - 'Linear', 'BatchNorm', 'Embedding', - 'Conv3DTranspose', ] -class Conv3D(layers.Layer): - r""" - **Convlution3D Layer** - - The convolution3D layer calculates the output based on the input, filter - and strides, paddings, dilations, groups parameters. Input(Input) and - Output(Output) are multidimensional tensors with a shape of - :math:`[N, C, D, H, W]` . 
Where N is batch size, C is the number of - channels, D is the depth of the feature, H is the height of the feature, - and W is the width of the feature. Convlution3D is similar with Convlution2D - but adds one dimension(depth). If bias attribution and activation type are - provided, bias is added to the output of the convolution, and the - corresponding activation function is applied to the final result. - - For each input :math:`X`, the equation is: - - .. math:: - - Out = \sigma (W \\ast X + b) - - In the above equation: - - * :math:`X`: Input value, a tensor with NCDHW or NDHWC format. - * :math:`W`: Filter value, a tensor with MCDHW format. - * :math:`\\ast`: Convolution operation. - * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. - * :math:`\\sigma`: Activation function. - * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. - - Example: - - - Input: - - Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` - - Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)` - - - Output: - Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` - - Where - - .. math:: - - D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\ - H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\ - W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1 - - Parameters: - num_channels(int): The number of channels in the input image. - num_filters(int): The number of filter. It is as same as the output image channel. - filter_size (int|tuple, optional): The filter size. If filter_size is a tuple, - it must contain three integers, (filter_size_D, filter_size_H, filter_size_W). - Otherwise, the filter will be a square, filter_size_depth = filter_size_height - = filter_size_width = filter_size. - stride (int|tuple, optional): The stride size. If stride is a tuple, it must - contain three integers, (stride_D, stride_H, stride_W). Otherwise, the - stride_D = stride_H = stride_W = stride. The default value is 1. - padding (int|tuple, optional): The padding size. If padding is a tuple, it must - contain three integers, (padding_D, padding_H, padding_W). Otherwise, the - padding_D = padding_H = padding_W = padding. The default value is 0. - dilation (int|tuple, optional): The dilation size. If dilation is a tuple, it must - contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the - dilation_D = dilation_H = dilation_W = dilation. The default value is 1. - groups (int, optional): The groups number of the Conv3D Layer. According to grouped - convolution in Alex Krizhevsky's Deep CNN paper: when group=2, - the first half of the filters is only connected to the first half - of the input channels, while the second half of the filters is only - connected to the second half of the input channels. The default value is 1. - param_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights - of conv3d. If it is set to None or one attribute of ParamAttr, conv3d - will create ParamAttr as param_attr. If it is set to None, the parameter - is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is - :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None. - bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of conv3d. - If it is set to False, no bias will be added to the output units. 
- If it is set to None or one attribute of ParamAttr, conv3d - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. The default value is None. - use_cudnn (bool, optional): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. The default value is True. - act (str, optional): Activation type, if it is set to None, activation is not appended. - The default value is None. - dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32". - - Attribute: - **weight** (Parameter): the learnable weights of filters of this layer. - - **bias** (Parameter): the learnable bias of this layer. - - Returns: - None. - - Raises: - ValueError: If the shapes of input, filter_size, stride, padding and - groups mismatch. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy - - with fluid.dygraph.guard(): - data = numpy.random.random((5, 3, 12, 32, 32)).astype('float32') - conv3d = fluid.dygraph.nn.Conv3D( - num_channels=3, num_filters=2, filter_size=3, act="relu") - ret = conv3d(fluid.dygraph.base.to_variable(data)) - - """ - - def __init__( - self, - num_channels, - num_filters, - filter_size, - stride=1, - padding=0, - dilation=1, - groups=None, - param_attr=None, - bias_attr=None, - use_cudnn=True, - act=None, - dtype='float32', - ): - assert param_attr is not False, "param_attr should not be False here." - super().__init__() - self._num_channels = num_channels - self._groups = groups - self._stride = utils.convert_to_list(stride, 3, 'stride') - self._padding = utils.convert_to_list(padding, 3, 'padding') - self._dilation = utils.convert_to_list(dilation, 3, 'dilation') - self._act = act - self._use_cudnn = use_cudnn - self._filter_size = filter_size - self._num_filters = num_filters - self._param_attr = param_attr - self._bias_attr = bias_attr - self._dtype = dtype - - if self._groups is None: - num_filter_channels = self._num_channels - else: - if self._num_channels % self._groups != 0: - raise ValueError("num_channels must be divisible by groups.") - num_filter_channels = self._num_channels // self._groups - - filter_size = utils.convert_to_list(self._filter_size, 3, 'filter_size') - filter_shape = [self._num_filters, num_filter_channels] + filter_size - - def _get_default_param_initializer(): - filter_elem_num = ( - filter_size[0] - * filter_size[1] - * filter_size[2] - * self._num_channels - ) - std = (2.0 / filter_elem_num) ** 0.5 - return Normal(0.0, std, 0) - - self.weight = self.create_parameter( - attr=self._param_attr, - shape=filter_shape, - dtype=self._dtype, - default_initializer=_get_default_param_initializer(), - ) - - self.bias = self.create_parameter( - attr=self._bias_attr, - shape=[self._num_filters], - dtype=self._dtype, - is_bias=True, - ) - - def forward(self, input): - pre_bias = self._helper.create_variable_for_type_inference( - dtype=self._dtype - ) - - self._helper.append_op( - type='conv3d', - inputs={ - 'Input': input, - 'Filter': self.weight, - }, - outputs={"Output": pre_bias}, - attrs={ - 'strides': self._stride, - 'paddings': self._padding, - 'dilations': self._dilation, - 'groups': self._groups if self._groups else 1, - 'use_cudnn': self._use_cudnn, - 'use_mkldnn': False, - }, - ) - - if self.bias is not None: - pre_act = self._helper.create_variable_for_type_inference( - dtype=self._dtype - ) - self._helper.append_op( - type='elementwise_add', - inputs={'X': [pre_bias], 'Y': [self.bias]}, - outputs={'Out': [pre_act]}, - 
attrs={'axis': 1}, - ) - else: - pre_act = pre_bias - - return self._helper.append_activation(pre_act, act=self._act) - - -class Conv3DTranspose(layers.Layer): - r""" - **Convlution3D transpose layer** - - The convolution3D transpose layer calculates the output based on the input, - filter, and dilations, strides, paddings. Input(Input) and output(Output) - are in NCDHW format. Where N is batch size, C is the number of channels, - D is the depth of the feature, H is the height of the feature, and W - is the width of the feature. Parameters(dilations, strides, paddings) are - two elements. These two elements represent height and width, respectively. - The details of convolution transpose layer, please refer to the following - explanation and references `therein `_. - If bias attribution and activation type are provided, bias is added to - the output of the convolution, and the corresponding activation function - is applied to the final result. - - For each input :math:`X`, the equation is: - - .. math:: - - Out = \sigma (W \\ast X + b) - - In the above equation: - - * :math:`X`: Input value, a tensor with NCDHW format. - * :math:`W`: Filter value, a tensor with MCDHW format. - * :math:`\\ast`: Convolution operation. - * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. - * :math:`\\sigma`: Activation function. - * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. - - Example: - - - Input: - - Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` - - Filter shape: :math:`(C_{in}, C_{out}, D_f, H_f, W_f)` - - - Output: - - Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` - - Where - - .. math:: - - D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\ - H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\ - W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1 \\\\ - D_{out} &\in [ D^\prime_{out}, D^\prime_{out} + strides[0] ] \\\\ - H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[1] ] \\\\ - - **Note**: - - The conv3d_transpose can be seen as the backward of the conv3d. For conv3d, - when stride > 1, conv3d maps multiple input shape to the same output shape, - so for conv3d_transpose, when stride > 1, input shape maps multiple output shape. - If output_size is None, :math:`H_{out} = H^\prime_{out}, :math:`H_{out} = \ - H^\prime_{out}, W_{out} = W^\prime_{out}`; else, the :math:`D_{out}` of the output - size must between :math:`D^\prime_{out}` and :math:`D^\prime_{out} + strides[0]`, - the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}` - and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the output size must - between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`, - conv3d_transpose can compute the kernel size automatically. - - - Parameters: - num_channels(int): The number of channels in the input image. - num_filters(int): The number of the filter. It is as same as the output - image channel. - filter_size(int|tuple): The filter size. If filter_size is a tuple, - it must contain three integers, (filter_size_D, filter_size_H, filter_size_W). - Otherwise, the filter will be a square. - padding(int|tuple, optional): The padding size. The padding argument effectively - adds `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a string, - either 'VALID' or 'SAME' supported, which is the padding algorithm. 
If `padding` - is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or - `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `'NCDHW'`, `padding` can be in the form - `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `'NDHWC'`, `padding` can be in the form - `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. - The default value is 0. - stride(int|tuple, optional): The stride size. It means the stride in transposed convolution. - If stride is a tuple, it must contain three integers, (stride_depth, stride_height, - stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. - The default value is 1. - dilation(int|tuple, optional): The dilation size. If dilation is a tuple, it must - contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the - dilation_D = dilation_H = dilation_W = dilation. The default value is 1. - groups(int, optional): The groups number of the Conv3D transpose layer. Inspired by - grouped convolution in Alex Krizhevsky's Deep CNN paper, in which - when group=2, the first half of the filters is only connected to the - first half of the input channels, while the second half of the - filters is only connected to the second half of the input channels. - The default value is 1. - param_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights - of conv3d_transpose. If it is set to None or one attribute of ParamAttr, conv3d_transpose - will create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. The default value is None. - bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of conv3d_transpose. - If it is set to False, no bias will be added to the output units. - If it is set to None or one attribute of ParamAttr, conv3d_transpose - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. The default value is None. - use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. The default value is True. - act (str, optional): Activation type, if it is set to None, activation is not appended. - The default value is None. - name(str, optional): The default value is None. Normally there is no need for user - to set this property. For more information, please refer to :ref:`api_guide_Name`. - - Attribute: - **weight** (Parameter): the learnable weights of filters of this layer. - - **bias** (Parameter): the learnable bias of this layer. - - Returns: - None. - - Raises: - ValueError: If the shapes of input, filter_size, stride, padding and - groups mismatch. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import numpy - - with fluid.dygraph.guard(): - data = numpy.random.random((5, 3, 12, 32, 32)).astype('float32') - conv3dTranspose = fluid.dygraph.nn.Conv3DTranspose( - num_channels=3, - num_filters=12, - filter_size=12, - use_cudnn=False) - ret = conv3dTranspose(fluid.dygraph.base.to_variable(data)) - - """ - - def __init__( - self, - num_channels, - num_filters, - filter_size, - padding=0, - stride=1, - dilation=1, - groups=None, - param_attr=None, - bias_attr=None, - use_cudnn=True, - act=None, - dtype='float32', - ): - super().__init__() - if not isinstance(use_cudnn, bool): - raise ValueError("use_cudnn should be True or False") - assert ( - param_attr is not False - ), "param_attr should not be False in conv3d_transpose." - self._padding = utils.convert_to_list(padding, 3, 'padding') - self._stride = utils.convert_to_list(stride, 3, 'stride') - self._dilation = utils.convert_to_list(dilation, 3, 'dilation') - self._param_attr = param_attr - self._num_channels = num_channels - self._filter_size = filter_size - self._groups = 1 if groups is None else groups - self._num_filters = num_filters - self._use_cudnn = use_cudnn - self._bias_attr = bias_attr - self._act = act - self._dtype = dtype - - self._filter_size = utils.convert_to_list( - self._filter_size, 3, 'conv3d_transpose.filter_size' - ) - - filter_shape = [ - self._num_channels, - self._num_filters // self._groups, - ] + self._filter_size - self.weight = self.create_parameter( - dtype=self._dtype, shape=filter_shape, attr=self._param_attr - ) - self.bias = self.create_parameter( - attr=self._bias_attr, - shape=[self._num_filters], - dtype=self._dtype, - is_bias=True, - ) - - def forward(self, input): - pre_bias = self._helper.create_variable_for_type_inference( - dtype=self._dtype - ) - self._helper.append_op( - type="conv3d_transpose", - inputs={'Input': [input], 'Filter': [self.weight]}, - outputs={'Output': pre_bias}, - attrs={ - 'strides': self._stride, - 'paddings': self._padding, - 'dilations': self._dilation, - 'groups': self._groups if self._groups else 1, - 'use_cudnn': self._use_cudnn, - }, - ) - - if self._bias_attr: - pre_act = self._helper.create_variable_for_type_inference( - dtype=self._dtype - ) - self._helper.append_op( - type='elementwise_add', - inputs={'X': [pre_bias], 'Y': [self.bias]}, - outputs={'Out': [pre_act]}, - attrs={'axis': 1}, - ) - else: - pre_act = pre_bias - - # Currently, we don't support inplace in imperative mode - return self._helper.append_activation(pre_act, act=self._act) - - -class Linear(layers.Layer): - """ - - Fully-connected linear transformation layer: - - .. math:: - - Out = Act({XW + b}) - - where :math:`X` is the input Tensor, :math:`W` and :math:`b` are weight and bias respectively. - - Linear layer takes only one ``Tensor`` input. - The Linear layer multiplies input tensor with weight matrix and - produces an output Tensor of shape [N, *, `output_dim`], - where N is batch size and `*` means any number of additional dimensions. - If ``bias_attr`` is not None, a bias variable will be created and added to the output. - Finally, if ``act`` is not None, it will be applied to the output as well. - - Parameters: - input_dim(int): The number of input units in this layer. - output_dim(int): The number of output units in this layer. - param_attr(ParamAttr or list of ParamAttr, optional): The parameter attribute for learnable - weights(Parameter) of this layer. Default: None. 
- bias_attr(ParamAttr or list of ParamAttr, optional): The attribute for the bias - of this layer. If it is set to False, no bias will be added to the output units. - If it is set to None, the bias is initialized zero. Default: None. - act(str, optional): Activation to be applied to the output of this layer. Default: None. - dtype(str, optional): Dtype used for weight, it can be "float32" or "float64". Default: "float32". - - Attributes: - **weight** (Parameter): the learnable weights of this layer. - - **bias** (Parameter or None): the learnable bias of this layer. - - Returns: - None - - Examples: - .. code-block:: python - - from paddle.fluid.dygraph.base import to_variable - import paddle.fluid as fluid - from paddle.fluid.dygraph import Linear - import numpy as np - - data = np.random.uniform(-1, 1, [30, 10, 32]).astype('float32') - with fluid.dygraph.guard(): - linear = Linear(32, 64) - data = to_variable(data) - res = linear(data) # [30, 10, 64] - """ - - def __init__( - self, - input_dim, - output_dim, - param_attr=None, - bias_attr=None, - act=None, - dtype="float32", - ): - super().__init__() - self._act = act - self._dtype = dtype - self.weight = self.create_parameter( - shape=[input_dim, output_dim], - attr=param_attr, - dtype=dtype, - is_bias=False, - ) - self.bias = self.create_parameter( - shape=[output_dim], attr=bias_attr, dtype=dtype, is_bias=True - ) - - self._use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] - - def forward(self, input): - if _non_static_mode(): - pre_bias = _varbase_creator(dtype=input.dtype) - _legacy_C_ops.matmul( - input, - self.weight, - pre_bias, - 'transpose_X', - False, - 'transpose_Y', - False, - "alpha", - 1, - "use_mkldnn", - self._use_mkldnn, - ) - pre_act = dygraph_utils._append_bias_in_dygraph( - pre_bias, - self.bias, - axis=len(input.shape) - 1, - use_mkldnn=self._use_mkldnn, - ) - - return dygraph_utils._append_activation_in_dygraph( - pre_act, self._act, use_mkldnn=self._use_mkldnn - ) - - check_variable_and_dtype( - input, 'input', ['float16', 'float32', 'float64'], "Linear" - ) - - attrs = { - "transpose_X": False, - "transpose_Y": False, - "alpha": 1, - "use_mkldnn": self._use_mkldnn, - } - inputs = {"X": [input], "Y": [self.weight]} - - tmp = self._helper.create_variable_for_type_inference(self._dtype) - self._helper.append_op( - type="matmul", inputs=inputs, outputs={"Out": tmp}, attrs=attrs - ) - if self.bias is not None: - pre_activation = self._helper.create_variable_for_type_inference( - dtype=self._dtype - ) - self._helper.append_op( - type='elementwise_add', - inputs={'X': [tmp], 'Y': [self.bias]}, - outputs={'Out': [pre_activation]}, - attrs={ - 'axis': len(input.shape) - 1, - 'use_mkldnn': self._use_mkldnn, - }, - ) - else: - pre_activation = tmp - return self._helper.append_activation(pre_activation, act=self._act) - - class BatchNorm(layers.Layer): r""" diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 0d2cd0cbf2db0c9e52b7379813253b3e398ce6b0..f52ba97066c1b1390c61a0411b0a2a1cbf1d61a6 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -165,12 +165,12 @@ def monkey_patch_varbase(): import paddle.fluid as fluid from paddle.fluid.dygraph.base import to_variable - from paddle.fluid.dygraph import Linear + from paddle.nn import Linear import numpy as np data = np.ones([3, 1024], dtype='float32') with fluid.dygraph.guard(): - linear = fluid.dygraph.Linear(1024, 4) + linear = 
Linear(1024, 4) t = to_variable(data) linear(t) # call with default weight custom_weight = np.random.randn(1024, 4).astype("float32") diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py index ce8c7fb22a7584ea68e64addc731452ab66a7add..5756361f89e46f005072d2136b2e13de4762525b 100644 --- a/python/paddle/fluid/install_check.py +++ b/python/paddle/fluid/install_check.py @@ -39,8 +39,10 @@ __all__ = ['run_check'] class SimpleLayer(Layer): def __init__(self, input_size): super().__init__() - self._linear1 = nn.Linear( - input_size, 3, param_attr=ParamAttr(initializer=Constant(value=0.1)) + self._linear1 = paddle.nn.Linear( + input_size, + 3, + weight_attr=ParamAttr(initializer=Constant(value=0.1)), ) def forward(self, inputs): diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index ea10b49e9cc6f4abcf340900f60f2fd239ef0cfd..ac5d70707903aef8484d735bf3a9273d931f7e2d 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -475,9 +475,10 @@ class Optimizer: .. code-block:: python import paddle.fluid as fluid + import paddle with fluid.dygraph.guard(): - linear = fluid.dygraph.nn.Linear(10, 10) + linear = paddle.nn.Linear(10, 10) adam = fluid.optimizer.Adam(0.1, parameter_list=linear.parameters()) @@ -576,6 +577,7 @@ class Optimizer: import paddle.fluid as fluid import numpy as np + import paddle # example1: LearningRateDecay is not used, return value is all the same with fluid.dygraph.guard(): @@ -587,10 +589,10 @@ class Optimizer: # example2: PiecewiseDecay is used, return the step learning rate with fluid.dygraph.guard(): inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") - linear = fluid.dygraph.nn.Linear(10, 10) + linear = paddle.nn.Linear(10, 10) inp = fluid.dygraph.to_variable(inp) out = linear(inp) - loss = fluid.layers.reduce_mean(out) + loss = paddle.mean(out) bd = [2, 4, 6, 8] value = [0.2, 0.4, 0.6, 0.8, 1.0] @@ -1340,12 +1342,13 @@ class Optimizer: .. code-block:: python import paddle.fluid as fluid + import paddle import numpy as np with fluid.dygraph.guard(): value = np.arange(26).reshape(2, 13).astype("float32") a = fluid.dygraph.to_variable(value) - linear = fluid.Linear(13, 5, dtype="float32") + linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. 
adam = fluid.optimizer.Adam(learning_rate = 0.01, parameter_list = linear.parameters()) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py index 52ec9e5b121b2fdcba2ae9177118242d286e50a2..8d49434ac54e8bdfec82931467032d6d3f24eea7 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py @@ -18,7 +18,7 @@ from test_dist_base import TestParallelDyGraphRunnerBase, runtime_main import paddle import paddle.fluid as fluid import paddle.nn.functional as F -from paddle.fluid.dygraph import Embedding, Layer, Linear, to_variable +from paddle.fluid.dygraph import Embedding, Layer, to_variable from paddle.optimizer.lr import NoamDecay """ @@ -269,8 +269,8 @@ class PrePostProcessLayer(Layer): class PositionwiseFeedForwardLayer(Layer): def __init__(self, d_inner_hid, d_hid, dropout_rate): super().__init__() - self._i2h = Linear(d_hid, d_inner_hid, act="relu") - self._h2o = Linear(d_inner_hid, d_hid) + self._i2h = paddle.nn.Linear(d_hid, d_inner_hid) + self._h2o = paddle.nn.Linear(d_inner_hid, d_hid) self._dropout_rate = dropout_rate def forward(self, x): @@ -304,10 +304,18 @@ class MultiHeadAttentionLayer(Layer): self._d_value = d_value self._d_model = d_model self._dropout_rate = dropout_rate - self._q_fc = Linear(self._d_model, d_key * n_head, bias_attr=False) - self._k_fc = Linear(self._d_model, d_key * n_head, bias_attr=False) - self._v_fc = Linear(self._d_model, d_value * n_head, bias_attr=False) - self._proj_fc = Linear(d_value * n_head, self._d_model, bias_attr=False) + self._q_fc = paddle.nn.Linear( + self._d_model, d_key * n_head, bias_attr=False + ) + self._k_fc = paddle.nn.Linear( + self._d_model, d_key * n_head, bias_attr=False + ) + self._v_fc = paddle.nn.Linear( + self._d_model, d_value * n_head, bias_attr=False + ) + self._proj_fc = paddle.nn.Linear( + d_value * n_head, self._d_model, bias_attr=False + ) def forward(self, queries, keys, values, attn_bias): # compute q ,k ,v @@ -825,7 +833,9 @@ class WrapDecoderLayer(Layer): ) self._weight_sharing = weight_sharing if not weight_sharing: - self._fc = Linear(d_model, trg_vocab_size, bias_attr=False) + self._fc = paddle.nn.Linear( + d_model, trg_vocab_size, bias_attr=False + ) def forward(self, dec_inputs=None, enc_output=None): trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs
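The hunks above all apply the same call-site pattern: the keyword-style `input_dim`/`output_dim` arguments of the removed `paddle.fluid.dygraph.Linear` become positional in/out feature sizes, `param_attr` becomes `weight_attr`, and the fused `act` argument disappears, so any activation is applied explicitly after the layer. Below is a minimal sketch of that pattern, assuming paddle 2.x; the tensor shape and variable names are illustrative only and do not come from the patch itself.

```python
import numpy as np
import paddle
import paddle.nn.functional as F

# Illustrative input; shape [4, 32] is arbitrary.
x = paddle.to_tensor(np.random.uniform(-1, 1, [4, 32]).astype("float32"))

# Old (removed): fluid.dygraph.Linear(input_dim=32, output_dim=64, act="relu")
# New: paddle.nn.Linear takes positional in/out features, uses weight_attr
# instead of param_attr, and no longer fuses an activation.
linear = paddle.nn.Linear(32, 64)
out = F.relu(linear(x))  # apply the activation explicitly
print(out.shape)  # [4, 64]
```

This is why, for example, the transformer test above drops `act="relu"` from the constructor call: with `paddle.nn.Linear` the activation has to be invoked separately in `forward`.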