提交 e830dfa9 编写于 作者: X xiteng1988

add ofa demo

上级 a596e2b6
from .once_for_all_kernel import OFA_kernel
from .mobilenet_v1 import MobileNetV1
from .mobilenet_v2 import MobileNetV2
from .mobilenet_v3 import MobileNetV3_dy
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
from __future__ import print_function
__all__ = ['conv2d', 'conv2d_transpose', 'conv3d', 'conv3d_transpose']
import numpy as np
from paddle.fluid.framework import Variable, in_dygraph_mode
from paddle.fluid import core, dygraph_utils
from paddle.fluid.layers import nn, utils
from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.layer_helper import LayerHelper
def _is_list_or_tuple(input):
return isinstance(input, (list, tuple))
def _zero_padding_in_batch_and_channel(padding, channel_last):
if channel_last:
return list(padding[0]) == [0, 0] and list(padding[-1]) == [0, 0]
else:
return list(padding[0]) == [0, 0] and list(padding[1]) == [0, 0]
def _exclude_padding_in_batch_and_channel(padding, channel_last):
padding_ = padding[1:-1] if channel_last else padding[2:]
padding_ = [elem for pad_a_dim in padding_ for elem in pad_a_dim]
return padding_
def _update_padding_nd(padding, channel_last, num_dims):
if isinstance(padding, str):
padding = padding.upper()
if padding not in ["SAME", "VALID"]:
raise ValueError(
"Unknown padding: '{}'. It can only be 'SAME' or 'VALID'.".
format(padding))
if padding == "VALID":
padding_algorithm = "VALID"
padding = [0] * num_dims
else:
padding_algorithm = "SAME"
padding = [0] * num_dims
elif _is_list_or_tuple(padding):
# for padding like
# [(pad_before, pad_after), (pad_before, pad_after), ...]
# padding for batch_dim and channel_dim included
if len(padding) == 2 + num_dims and _is_list_or_tuple(padding[0]):
if not _zero_padding_in_batch_and_channel(padding, channel_last):
raise ValueError(
"Non-zero padding({}) in the batch or channel dimensions "
"is not supported.".format(padding))
padding_algorithm = "EXPLICIT"
padding = _exclude_padding_in_batch_and_channel(padding,
channel_last)
if utils._is_symmetric_padding(padding, num_dims):
padding = padding[0::2]
# for padding like [pad_before, pad_after, pad_before, pad_after, ...]
elif len(padding) == 2 * num_dims and isinstance(padding[0], int):
padding_algorithm = "EXPLICIT"
padding = utils.convert_to_list(padding, 2 * num_dims, 'padding')
if utils._is_symmetric_padding(padding, num_dims):
padding = padding[0::2]
# for padding like [pad_d1, pad_d2, ...]
elif len(padding) == num_dims and isinstance(padding[0], int):
padding_algorithm = "EXPLICIT"
padding = utils.convert_to_list(padding, num_dims, 'padding')
else:
raise ValueError("In valid padding: {}".format(padding))
# for integer padding
else:
padding_algorithm = "EXPLICIT"
padding = utils.convert_to_list(padding, num_dims, 'padding')
return padding, padding_algorithm
def conv2d(input,
weight,
bias=None,
padding=0,
stride=1,
dilation=1,
groups=1,
use_cudnn=True,
act=None,
data_format="NCHW",
name=None):
"""
:alias_main: paddle.nn.functional.conv2d
:alias: paddle.nn.functional.conv2d,paddle.nn.functional.conv.conv2d
The convolution2D layer calculates the output based on the input, filter
and strides, paddings, dilations, groups parameters. Input and
Output are in NCHW or NHWC format, where N is batch size, C is the number of
channels, H is the height of the feature, and W is the width of the feature.
Filter is in MCHW format, where M is the number of output image channels,
C is the number of input image channels, H is the height of the filter,
and W is the width of the filter. If the groups is greater than 1,
C will equal the number of input image channels divided by the groups.
Please refer to UFLDL's `convolution
<http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_
for more details.
If bias attribution and activation type are provided, bias is added to the
output of the convolution, and the corresponding activation function is
applied to the final result.
For each input :math:`X`, the equation is:
.. math::
Out = \sigma (W \\ast X + b)
Where:
* :math:`X`: Input value, a tensor with NCHW or NHWC format.
* :math:`W`: Filter value, a tensor with MCHW format.
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
Example:
- Input:
Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
- Output:
Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
Where
.. math::
H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
Args:
input (Variable): The input is 4-D Tensor with shape [N, C, H, W], the data type
of input is float16 or float32 or float64.
weight (Variable): The convolution kernel with shape [M, C/g, kH, kW], where M is
the number of output channels, g is the number of groups, kH is the filter's
height, kW is the filter's width.
bias (Variable, optional): The bias with shape [M,].
padding (string|int|list|tuple): The padding size. It means the number of zero-paddings
on both sides for each dimension.If `padding` is a string, either 'VALID' or
'SAME' which is the padding algorithm. If padding size is a tuple or list,
it could be in three forms: `[pad_height, pad_width]` or
`[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when
`data_format` is `"NCHW"`, `padding` can be in the form `[[0,0], [0,0],
[pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
when `data_format` is `"NHWC"`, `pool_padding` can be in the form
`[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
Default: padding = 0.
stride (int|tuple): The stride size. It means the stride in convolution.
If stride is a tuple, it must contain two integers, (stride_height, stride_width).
Otherwise, stride_height = stride_width = stride. Default: stride = 1.
dilation (int|tuple): The dilation size. It means the spacing between the kernel
points. If dilation is a tuple, it must contain two integers, (dilation_height,
dilation_width). Otherwise, dilation_height = dilation_width = dilation.
Default: dilation = 1.
groups (int): The groups number of the Conv2d Layer. According to grouped
convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
the first half of the filters is only connected to the first half
of the input channels, while the second half of the filters is only
connected to the second half of the input channels. Default: groups=1.
use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True
act (str): Activation type, if it is set to None, activation is not appended.
Default: None
data_format (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
A Variable holding Tensor representing the conv2d, whose data type is the
same with input. If act is None, the tensor variable storing the convolution
result, and if act is not None, the tensor variable storing convolution
and non-linearity activation result.
Raises:
ValueError: If the type of `use_cudnn` is not bool.
ValueError: If `data_format` is not "NCHW" or "NHWC".
ValueError: If the channel dimmention of the input is less than or equal to zero.
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0
or the element corresponding to the input's channel is not 0.
ShapeError: If the input is not 4-D Tensor.
ShapeError: If the input's dimension size and filter's dimension size not equal.
ShapeError: If the dimension size of input minus the size of `stride` is not 2.
ShapeError: If the number of input channels is not equal to filter's channels * groups.
ShapeError: If the number of output channels is not be divided by groups.
Examples:
.. code-block:: python
from paddle import fluid
import paddle.nn.functional as F
import paddle.fluid.dygraph as dg
import numpy as np
x = np.random.randn(2, 3, 8, 8).astype(np.float32)
w = np.random.randn(6, 3, 3, 3).astype(np.float32)
place = fluid.CPUPlace()
with dg.guard(place):
x_var = dg.to_variable(x)
w_var = dg.to_variable(w)
y_var = F.conv2d(x_var, w_var, act="relu")
y_np = y_var.numpy()
print(y_np.shape)
# (2, 6, 6, 6)
"""
# entry checks
if not isinstance(use_cudnn, bool):
raise ValueError("Attr(use_cudnn) should be True or False. "
"Received Attr(use_cudnn): {}.".format(use_cudnn))
if data_format not in ["NCHW", "NHWC"]:
raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'. "
"Received Attr(data_format): {}.".format(data_format))
channel_last = (data_format == "NHWC")
channel_dim = -1 if channel_last else 1
num_channels = input.shape[channel_dim]
num_filters = weight.shape[0]
if num_channels < 0:
raise ValueError("The channel dimmention of the input({}) "
"should be defined. Received: {}.".format(
input.shape, num_channels))
if num_channels % groups != 0:
raise ValueError(
"the channel of input must be divisible by groups,"
"received: the channel of input is {}, the shape of input is {}"
", the groups is {}".format(num_channels, input.shape, groups))
if num_filters % groups != 0:
raise ValueError(
"the number of filters must be divisible by groups,"
"received: the number of filters is {}, the shape of weight is {}"
", the groups is {}".format(num_filters, weight.shape, groups))
# update attrs
padding, padding_algorithm = _update_padding_nd(padding, channel_last, 2)
stride = utils.convert_to_list(stride, 2, 'stride')
dilation = utils.convert_to_list(dilation, 2, 'dilation')
l_type = "conv2d"
if (num_channels == groups and num_filters % num_channels == 0 and
not use_cudnn):
l_type = 'depthwise_conv2d'
inputs = {'Input': [input], 'Filter': [weight]}
attrs = {
'strides': stride,
'paddings': padding,
'dilations': dilation,
'groups': groups,
'use_cudnn': use_cudnn,
'use_mkldnn': False,
'fuse_relu_before_depthwise_conv': False,
"padding_algorithm": padding_algorithm,
"data_format": data_format
}
if in_dygraph_mode():
attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation,
'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False,
'fuse_relu_before_depthwise_conv', False, "padding_algorithm",
padding_algorithm, "data_format", data_format)
pre_bias = getattr(core.ops, l_type)(input, weight, *attrs)
if bias is not None:
pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
else:
pre_act = pre_bias
out = dygraph_utils._append_activation_in_dygraph(
pre_act, act, use_cudnn=use_cudnn)
else:
inputs = {'Input': [input], 'Filter': [weight]}
attrs = {
'strides': stride,
'paddings': padding,
'dilations': dilation,
'groups': groups,
'use_cudnn': use_cudnn,
'use_mkldnn': False,
'fuse_relu_before_depthwise_conv': False,
"padding_algorithm": padding_algorithm,
"data_format": data_format
}
check_variable_and_dtype(input, 'input',
['float16', 'float32', 'float64'], 'conv2d')
helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype()
pre_bias = helper.create_variable_for_type_inference(dtype)
outputs = {"Output": [pre_bias]}
helper.append_op(
type=l_type, inputs=inputs, outputs=outputs, attrs=attrs)
if bias is not None:
pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
else:
pre_act = pre_bias
out = helper.append_activation(pre_act)
return out
def conv2d_transpose(input,
weight,
bias=None,
output_size=None,
padding=0,
stride=1,
dilation=1,
groups=1,
use_cudnn=True,
act=None,
data_format='NCHW',
name=None):
"""
:alias_main: paddle.nn.functional.conv2d_transpose
:alias: paddle.nn.functional.conv2d_transpose,paddle.nn.functional.conv.conv2d_transpose
The convolution2D transpose layer calculates the output based on the input,
filter, and dilations, strides, paddings. Input(Input) and output(Output)
are in NCHW or NHWC format. Where N is batch size, C is the number of channels,
H is the height of the feature, and W is the width of the feature.
Parameters(dilations, strides, paddings) are two elements. These two elements
represent height and width, respectively. The details of convolution transpose
layer, please refer to the following explanation and references
`therein <https://arxiv.org/pdf/1603.07285.pdf>`_.
If bias attribution and activation type are provided, bias is added to
the output of the convolution, and the corresponding activation function
is applied to the final result.
For each input :math:`X`, the equation is:
.. math::
Out = \sigma (W \\ast X + b)
Where:
* :math:`X`: Input value, a 4-D Tensor with NCHW or NHWC format.
* :math:`W`: Filter value, a 4-D Tensor with MCHW format.
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D Tensor with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, a 4-D Tensor with data format 'NCHW' or 'NHWC', the shape of :math:`Out` and :math:`X` may be different.
Example:
- Input:
Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`
- Output:
Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
Where
.. math::
H^\prime_{out} &= (H_{in} - 1) * strides[0] - pad_height_top - pad_height_bottom + dilations[0] * (H_f - 1) + 1 \\\\
W^\prime_{out} &= (W_{in} - 1) * strides[1] - pad_width_left - pad_width_right + dilations[1] * (W_f - 1) + 1 \\\\
H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ] \\\\
W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] ]
Note:
The conv2d_transpose can be seen as the backward of the conv2d. For conv2d,
when stride > 1, conv2d maps multiple input shape to the same output shape,
so for conv2d_transpose, when stride > 1, input shape maps multiple output shape.
If output_size is None, :math:`H_{out} = H^\prime_{out}, W_{out} = W^\prime_{out}`;
else, the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}`
and :math:`H^\prime_{out} + strides[0]`, and the :math:`W_{out}` of the output size must
between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[1]`,
conv2d_transpose can compute the kernel size automatically.
Args:
input(Variable): 4-D Tensor with [N, C, H, W] or [N, H, W, C] format,
whose data type is float32 or float64.
weight(Variable): The convolution kernel, a Tensor with shape [C, M/g, kH, kW],
where M is the number of output channels(filters), g is the number of groups,
kH is the height of the kernel, and kW is the width of the kernel.
bias(Variable, optional): The bias, a Tensor with shape [M, ].
output_size(int|tuple|list, optional): The output image size. If output size is a
tuple, it must contain two integers, (image_height, image_width). None if use
filter_size, padding, and stride to calculate output_size.
If output_size is specified, output_size and filter_size (weight)'s shape
should follow the formula above. Default: None. output_size and filter_size
should not be None at the same time.
padding(int|list|str|tuple, optional): The padding size. The padding argument effectively adds
`dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a
string, either 'VALID' or 'SAME' supported, which is the padding algorithm.
If `padding` is a tuple or list, it could be in three forms:
`[pad_height, pad_width]` or
`[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and
when `data_format` is `'NCHW'`,
`padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
when `data_format` is `'NHWC'`, `padding` can be in the form
`[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
Default: padding = 0.
stride(int|tuple, optional): The stride size. It means the stride in transposed convolution.
If stride is a tuple, it must contain two integers, (stride_height, stride_width).
Otherwise, stride_height = stride_width = stride. Default: stride = 1.
dilation(int|tuple, optional): The dilation size. It means the spacing between the kernel points.
If dilation is a tuple, it must contain two integers, (dilation_height, dilation_width).
Otherwise, dilation_height = dilation_width = dilation. Default: dilation = 1.
groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by
grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
when group=2, the first half of the filters is only connected to the
first half of the input channels, while the second half of the
filters is only connected to the second half of the input channels.
Default: groups = 1.
use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True.
act (str, optional): Activation type, if it is set to None, activation is not appended.
Default: None.
data_format (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
A Variable holding Tensor representing the conv2d_transpose, whose
data type is the same with input and shape is (num_batches, channels, out_h,
out_w) or (num_batches, out_h, out_w, channels). If act is None, the tensor variable
storing the transposed convolution result, and if act is not None, the
tensor variable storing transposed convolution and non-linearity activation
result.
Raises:
ValueError: If the type of `use_cudnn` is not bool.
ValueError: If `data_format` is not "NCHW" or "NHWC".
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0
or the element corresponding to the input's channel is not 0.
ValueError: If `output_size` and filter_size are None at the same time.
ShapeError: If the input is not 4-D Tensor.
ShapeError: If the input's dimension size and filter's dimension size not equal.
ShapeError: If the dimension size of input minus the size of `stride` is not 2.
ShapeError: If the number of input channels is not equal to filter's channels.
ShapeError: If the size of `output_size` is not equal to that of `stride`.
Examples:
.. code-block:: python
from paddle import fluid
import paddle.nn.functional as F
import paddle.fluid.dygraph as dg
import numpy as np
x = np.random.randn(2, 3, 8, 8).astype(np.float32)
w = np.random.randn(3, 6, 3, 3).astype(np.float32)
place = fluid.CPUPlace()
with dg.guard(place):
x_var = dg.to_variable(x)
w_var = dg.to_variable(w)
y_var = F.conv2d_transpose(x_var, w_var, act="relu")
y_np = y_var.numpy()
print(y_np.shape)
# (2, 6, 10, 10)
"""
if not isinstance(use_cudnn, bool):
raise ValueError("Attr(use_cudnn) should be True or False. "
"Received Attr(use_cudnn): {}.".format(use_cudnn))
if data_format not in ['NCHW', 'NHWC']:
raise ValueError(
"Attr(data_format) of conv2d_transpose got wrong value: "
"received {}, but only 'NCHW' or 'NHWC' are supported.".format(
data_format))
channel_last = (data_format == "NHWC")
channel_dim = -1 if channel_last else 1
num_channels = input.shape[channel_dim]
if num_channels < 0:
raise ValueError("The channel dimmention of the input({}) "
"should be defined. Received: {}.".format(
input.shape, num_channels))
if num_channels % groups != 0:
raise ValueError(
"the channel of input must be divisible by groups,"
"received: the channel of input is {}, the shape of input is {}"
", the groups is {}".format(num_channels, input.shape, groups))
# update attrs
padding, padding_algorithm = _update_padding_nd(padding, channel_last, 2)
stride = utils.convert_to_list(stride, 2, 'stride')
dilation = utils.convert_to_list(dilation, 2, 'dilation')
if output_size is None:
output_size = []
elif isinstance(output_size, (list, tuple, int)):
output_size = utils.convert_to_list(output_size, 2, 'output_size')
else:
raise ValueError("output_size should be int, or list, tuple of ints")
op_type = 'conv2d_transpose'
num_filters = weight.shape[1]
if (num_channels == groups and num_filters == 1 and not use_cudnn):
op_type = 'depthwise_conv2d_transpose'
if in_dygraph_mode():
attrs = ('output_size', output_size, 'strides', stride, 'paddings',
padding, 'padding_algorithm', padding_algorithm, 'dilations',
dilation, 'groups', groups, 'use_cudnn', use_cudnn,
'data_format', data_format)
pre_bias = getattr(core.ops, op_type)(input, weight, *attrs)
if bias is not None:
pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
else:
pre_act = pre_bias
out = dygraph_utils._append_activation_in_dygraph(
pre_act, act, use_cudnn=use_cudnn)
else:
inputs = {'Input': [input], 'Filter': [weight]}
attrs = {
'output_size': output_size,
'strides': stride,
'paddings': padding,
'padding_algorithm': padding_algorithm,
'dilations': dilation,
'groups': groups,
'use_cudnn': use_cudnn,
'data_format': data_format
}
check_variable_and_dtype(input, 'input',
['float16', 'float32', 'float64'],
'conv2d_transpose')
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype()
pre_bias = helper.create_variable_for_type_inference(dtype)
outputs = {"Output": [pre_bias]}
helper.append_op(
type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
if bias is not None:
pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
else:
pre_act = pre_bias
out = helper.append_activation(pre_act)
return out
def conv3d(input,
weight,
bias=None,
padding=0,
stride=1,
dilation=1,
groups=1,
use_cudnn=True,
act=None,
data_format="NCDHW",
name=None):
"""
:alias_main: paddle.nn.functional.conv3d
:alias: paddle.nn.functional.conv3d,paddle.nn.functional.conv.conv3d
The convolution3D layer calculates the output based on the input, filter
and strides, paddings, dilations, groups parameters. Input(Input) and
Output(Output) are in NCDHW or NDHWC format. Where N is batch size C is the number of
channels, D is the depth of the feature, H is the height of the feature,
and W is the width of the feature. Convlution3D is similar with Convlution2D
but adds one dimension(depth). If bias attribution and activation type are
provided, bias is added to the output of the convolution, and the
corresponding activation function is applied to the final result.
For each input :math:`X`, the equation is:
.. math::
Out = \sigma (W \\ast X + b)
In the above equation:
* :math:`X`: Input value, a tensor with NCDHW or NDHWC format.
* :math:`W`: Filter value, a tensor with MCDHW format.
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
Example:
- Input:
Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)`
- Output:
Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
Where
.. math::
D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\
H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\
W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1
Args:
input (Variable): The input is 5-D Tensor with shape [N, C, D, H, W], the data
type of input is float16 or float32 or float64.
weight (Variable): The convolution kernel, a Tensor with shape [M, C/g, kD, kH, kW],
where M is the number of filters(output channels), g is the number of groups,
kD, kH, kW are the filter's depth, height and width respectively.
bias (Variable, optional): The bias, a Tensor of shape [M, ].
padding (string|int|list|tuple): The padding size. It means the number of zero-paddings
on both sides for each dimension. If `padding` is a string, either 'VALID' or
'SAME' which is the padding algorithm. If padding size is a tuple or list,
it could be in three forms: `[pad_depth, pad_height, pad_width]` or
`[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form
`[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
`[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
Default: padding = 0.
stride (int|tuple): The stride size. It means the stride in convolution. If stride is a
tuple, it must contain three integers, (stride_depth, stride_height, stride_width).
Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1.
dilation (int|tuple): The dilation size. It means the spacing between the kernel points.
If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height,
dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation.
Default: dilation = 1.
groups (int): The groups number of the Conv3d Layer. According to grouped
convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
the first half of the filters is only connected to the first half
of the input channels, while the second half of the filters is only
connected to the second half of the input channels. Default: groups=1
use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True
act (str): Activation type, if it is set to None, activation is not appended.
Default: None.
data_format (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
name(str|None): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
A Variable holding Tensor representing the conv3d, whose data type is
the same with input. If act is None, the tensor variable storing the
convolution result, and if act is not None, the tensor variable storing
convolution and non-linearity activation result.
Raises:
ValueError: If the type of `use_cudnn` is not bool.
ValueError: If `data_format` is not "NCDHW" or "NDHWC".
ValueError: If the channel dimmention of the input is less than or equal to zero.
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0
or the element corresponding to the input's channel is not 0.
ShapeError: If the input is not 5-D Tensor.
ShapeError: If the input's dimension size and filter's dimension size not equal.
ShapeError: If the dimension size of input minus the size of `stride` is not 2.
ShapeError: If the number of input channels is not equal to filter's channels * groups.
ShapeError: If the number of output channels is not be divided by groups.
Examples:
.. code-block:: python
from paddle import fluid
import paddle.nn.functional as F
import paddle.fluid.dygraph as dg
import numpy as np
x = np.random.randn(2, 3, 8, 8, 8).astype(np.float32)
w = np.random.randn(6, 3, 3, 3, 3).astype(np.float32)
place = fluid.CPUPlace()
with dg.guard(place):
x_var = dg.to_variable(x)
w_var = dg.to_variable(w)
y_var = F.conv3d(x_var, w_var, act="relu")
y_np = y_var.numpy()
print(y_np.shape)
# (2, 6, 6, 6, 6)
"""
# entry check
if not isinstance(use_cudnn, bool):
raise ValueError("Attr(use_cudnn) should be True or False. Received "
"Attr(use_cudnn): {}. ".format(use_cudnn))
if data_format not in ["NCDHW", "NDHWC"]:
raise ValueError(
"Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
"Attr(data_format): {}.".format(data_format))
channel_last = (data_format == "NDHWC")
channel_dim = -1 if channel_last else 1
num_channels = input.shape[channel_dim]
num_filters = weight.shape[0]
if num_channels < 0:
raise ValueError(
"The channel dimmention of the input({}) should be defined. "
"Received: {}.".format(input.shape, num_channels))
if num_channels % groups != 0:
raise ValueError(
"The number of input channels must be divisible by Attr(groups). "
"Received: number of channels({}), groups({}).".format(num_channels,
groups))
if num_filters % groups != 0:
raise ValueError(
"The number of filters must be divisible by Attr(groups). "
"Received: number of filters({}), groups({}).".format(num_filters,
groups))
padding, padding_algorithm = _update_padding_nd(padding, channel_last, 3)
stride = utils.convert_to_list(stride, 3, 'stride')
dilation = utils.convert_to_list(dilation, 3, 'dilation')
op_type = "conv3d"
if in_dygraph_mode():
attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation,
'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False,
"padding_algorithm", padding_algorithm, "data_format",
data_format)
pre_bias = getattr(core.ops, op_type)(input, weight, *attrs)
if bias is not None:
pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
else:
pre_act = pre_bias
out = dygraph_utils._append_activation_in_dygraph(
pre_act, act, use_cudnn=use_cudnn)
else:
inputs = {'Input': [input], 'Filter': [weight]}
attrs = {
'strides': stride,
'paddings': padding,
'dilations': dilation,
'groups': groups,
'use_cudnn': use_cudnn,
'use_mkldnn': False,
"padding_algorithm": padding_algorithm,
"data_format": data_format
}
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype()
check_variable_and_dtype(input, 'input',
['float16', 'float32', 'float64'], 'conv3d')
pre_bias = helper.create_variable_for_type_inference(dtype)
outputs = {"Output": [pre_bias]}
helper.append_op(
type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
if bias is not None:
pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
else:
pre_act = pre_bias
out = helper.append_activation(pre_act)
return out
def conv3d_transpose(input,
weight,
bias=None,
output_size=None,
padding=0,
stride=1,
dilation=1,
groups=1,
use_cudnn=True,
act=None,
data_format='NCDHW',
name=None):
"""
:alias_main: paddle.nn.functional.conv3d_transpose
:alias: paddle.nn.functional.conv3d_transpose,paddle.nn.functional.conv.conv3d_transpose
The convolution3D transpose layer calculates the output based on the input,
filter, and dilations, strides, paddings. Input(Input) and output(Output)
are in NCDHW or NDHWC format. Where N is batch size, C is the number of channels,
D is the depth of the feature, H is the height of the feature, and W
is the width of the feature. Parameters(dilations, strides, paddings) are
two elements. These two elements represent height and width, respectively.
The details of convolution transpose layer, please refer to the following
explanation and references `therein <https://arxiv.org/pdf/1603.07285.pdf>`_.
If bias attribution and activation type are provided, bias is added to
the output of the convolution, and the corresponding activation function
is applied to the final result.
For each input :math:`X`, the equation is:
.. math::
Out = \sigma (W \\ast X + b)
In the above equation:
* :math:`X`: Input value, a Tensor with NCDHW or NDHWC format.
* :math:`W`: Filter value, a Tensor with MCDHW format.
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D Tensor with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
Example:
- Input:
Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
Filter shape: :math:`(C_{in}, C_{out}, D_f, H_f, W_f)`
- Output:
Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
Where
.. math::
D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\
H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\
W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1 \\\\
D_{out} &\in [ D^\prime_{out}, D^\prime_{out} + strides[0] ] \\\\
H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[1] ] \\\\
W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[2] ]
Note:
The conv3d_transpose can be seen as the backward of the conv3d. For conv3d,
when stride > 1, conv3d maps multiple input shape to the same output shape,
so for conv3d_transpose, when stride > 1, input shape maps multiple output shape.
If output_size is None, :math:`H_{out} = H^\prime_{out}, :math:`H_{out} = \
H^\prime_{out}, W_{out} = W^\prime_{out}`; else, the :math:`D_{out}` of the output
size must between :math:`D^\prime_{out}` and :math:`D^\prime_{out} + strides[0]`,
the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}`
and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the output size must
between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`,
conv3d_transpose can compute the kernel size automatically.
Args:
input(Variable): The input is 5-D Tensor with shape [N, C, D, H, W] or [N, D, H, W, C], the data type
of input is float32 or float64.
weight (Variable): The convolution kernel, a Tensor with shape [C, M/g, kD, kH, kW],
where M is the number of filters(output channels), g is the number of groups,
kD, kH, kW are the filter's depth, height and width respectively.
bias (Variable, optional): The bias, a Tensor of shape [M, ].
output_size(int|tuple, optional): The output image size. If output size is a
tuple, it must contain three integers, (image_depth, image_height, image_width). This
parameter only works when filter_size is None. If output_size and filter_size are
specified at the same time, They should follow the formula above. Default: None.
Output_size and filter_size should not be None at the same time.
padding(int|list|str|tuple, optional): The padding size. The padding argument effectively
adds `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a string,
either 'VALID' or 'SAME' supported, which is the padding algorithm. If `padding`
is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or
`[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
and when `data_format` is `'NCDHW'`, `padding` can be in the form
`[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
when `data_format` is `'NDHWC'`, `padding` can be in the form
`[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
Default: padding = 0.
stride(int|tuple, optional): The stride size. It means the stride in transposed convolution.
If stride is a tuple, it must contain three integers, (stride_depth, stride_height,
stride_width). Otherwise, stride_depth = stride_height = stride_width = stride.
Default: stride = 1.
dilation(int|tuple, optional): The dilation size. It means the spacing between the kernel points.
If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height,
dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation.
Default: dilation = 1.
groups(int, optional): The groups number of the Conv3d transpose layer. Inspired by
grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
when group=2, the first half of the filters is only connected to the
first half of the input channels, while the second half of the
filters is only connected to the second half of the input channels.
Default: groups=1
use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True
act (str, optional): Activation type, if it is set to None, activation is not appended.
Default: None.
data_format (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
A Variable holding Tensor representing the conv3d_transpose, whose data
type is the same with input and shape is (num_batches, channels, out_d, out_h,
out_w) or (num_batches, out_d, out_h, out_w, channels). If act is None, the tensor
variable storing the transposed convolution result, and if act is not None, the tensor
variable storing transposed convolution and non-linearity activation result.
Raises:
ValueError: If the type of `use_cudnn` is not bool.
ValueError: If `data_format` is not "NCDHW" or "NDHWC".
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0
or the element corresponding to the input's channel is not 0.
ValueError: If `output_size` and filter_size are None at the same time.
ShapeError: If the input is not 5-D Tensor.
ShapeError: If the input's dimension size and filter's dimension size not equal.
ShapeError: If the dimension size of input minus the size of `stride` is not 2.
ShapeError: If the number of input channels is not equal to filter's channels.
ShapeError: If the size of `output_size` is not equal to that of `stride`.
Examples:
.. code-block:: python
from paddle import fluid
import paddle.nn.functional as F
import paddle.fluid.dygraph as dg
import numpy as np
x = np.random.randn(2, 3, 8, 8, 8).astype(np.float32)
w = np.random.randn(3, 6, 3, 3, 3).astype(np.float32)
place = fluid.CPUPlace()
with dg.guard(place):
x_var = dg.to_variable(x)
w_var = dg.to_variable(w)
y_var = F.conv3d_transpose(x_var, w_var, act="relu")
y_np = y_var.numpy()
print(y_np.shape)
# (2, 6, 10, 10, 10)
"""
# entry checks
if not isinstance(use_cudnn, bool):
raise ValueError("Attr(use_cudnn) should be True or False. "
"Received Attr(use_cudnn): {}.".format(use_cudnn))
if data_format not in ["NCDHW", "NDHWC"]:
raise ValueError(
"Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
"Attr(data_format): {}.".format(data_format))
channel_last = (data_format == "NDHWC")
channel_dim = -1 if channel_last else 1
num_channels = input.shape[channel_dim]
num_filters = weight.shape[1]
if num_channels < 0:
raise ValueError(
"The channel dimmention of the input({}) should be defined. "
"Received: {}.".format(input.shape, num_channels))
if num_channels % groups != 0:
raise ValueError(
"The number of input channels must be divisible by Attr(groups). "
"Received: number of channels({}), groups({}).".format(num_channels,
groups))
padding, padding_algorithm = _update_padding_nd(padding, channel_last, 3)
stride = utils.convert_to_list(stride, 3, 'stride')
dilation = utils.convert_to_list(dilation, 3, 'dilation')
if output_size is None:
output_size = []
elif isinstance(output_size, (list, tuple, int)):
output_size = utils.convert_to_list(output_size, 3, 'output_size')
else:
raise ValueError("output_size should be int, or list, tuple of ints")
op_type = 'conv3d_transpose'
data_format_ = "NHWC" if channel_last else "NCHW"
if in_dygraph_mode():
attrs = ('output_size', output_size, 'paddings', padding,
"padding_algorithm", padding_algorithm, 'strides', stride,
'dilations', dilation, 'groups', groups, 'use_cudnn',
use_cudnn, "data_format", data_format_)
pre_bias = getattr(core.ops, op_type)(input, weight, *attrs)
if bias is not None:
pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
else:
pre_act = pre_bias
out = dygraph_utils._append_activation_in_dygraph(
pre_act, act, use_cudnn=use_cudnn)
else:
inputs = {'Input': [input], 'Filter': [weight]}
attrs = {
'output_size': output_size,
'paddings': padding,
"padding_algorithm": padding_algorithm,
'strides': stride,
'dilations': dilation,
'groups': groups,
'use_cudnn': use_cudnn,
"data_format": data_format_
}
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype()
check_variable_and_dtype(input, 'input',
['float16', 'float32', 'float64'], 'conv3d')
pre_bias = helper.create_variable_for_type_inference(dtype)
outputs = {"Output": [pre_bias]}
helper.append_op(
type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
if bias is not None:
pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
else:
pre_act = pre_bias
out = helper.append_activation(pre_act)
return out
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
import os
import time
import sys
import math
import numpy as np
import argparse
import paddle
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid import framework
class ConvBNLayer(fluid.dygraph.Layer):
def __init__(self,
num_channels,
filter_size,
num_filters,
stride,
padding,
channels=None,
num_groups=1,
act='relu',
use_cudnn=True,
name=None):
super(ConvBNLayer, self).__init__()
self._conv = Conv2D(
num_channels=num_channels,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=num_groups,
act=None,
use_cudnn=use_cudnn,
param_attr=ParamAttr(
initializer=MSRA(), name=self.full_name() + "_weights"),
bias_attr=False)
self._batch_norm = BatchNorm(
num_filters,
act=act,
param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"),
bias_attr=ParamAttr(name=self.full_name() + "_bn" + "_offset"),
moving_mean_name=self.full_name() + "_bn" + '_mean',
moving_variance_name=self.full_name() + "_bn" + '_variance')
def forward(self, inputs):
y = self._conv(inputs)
y = self._batch_norm(y)
return y
class DepthwiseSeparable(fluid.dygraph.Layer):
def __init__(self,
num_channels,
num_filters1,
num_filters2,
num_groups,
stride,
scale,
name=None):
super(DepthwiseSeparable, self).__init__()
self._depthwise_conv = ConvBNLayer(
num_channels=num_channels,
num_filters=int(num_filters1 * scale),
filter_size=3,
stride=stride,
padding=1,
num_groups=int(num_groups * scale),
use_cudnn=False)
self._pointwise_conv = ConvBNLayer(
num_channels=int(num_filters1 * scale),
filter_size=1,
num_filters=int(num_filters2 * scale),
stride=1,
padding=0)
def forward(self, inputs):
y = self._depthwise_conv(inputs)
y = self._pointwise_conv(y)
return y
class MobileNetV1(fluid.dygraph.Layer):
def __init__(self, scale=1.0, class_dim=1000):
super(MobileNetV1, self).__init__()
self.scale = scale
self.dwsl = []
self.conv1 = ConvBNLayer(
num_channels=3,
filter_size=3,
channels=3,
num_filters=int(32 * scale),
stride=2,
padding=1)
dws21 = self.add_sublayer(
sublayer=DepthwiseSeparable(
num_channels=int(32 * scale),
num_filters1=32,
num_filters2=64,
num_groups=32,
stride=1,
scale=scale),
name="conv2_1")
self.dwsl.append(dws21)
dws22 = self.add_sublayer(
sublayer=DepthwiseSeparable(
num_channels=int(64 * scale),
num_filters1=64,
num_filters2=128,
num_groups=64,
stride=2,
scale=scale),
name="conv2_2")
self.dwsl.append(dws22)
dws31 = self.add_sublayer(
sublayer=DepthwiseSeparable(
num_channels=int(128 * scale),
num_filters1=128,
num_filters2=128,
num_groups=128,
stride=1,
scale=scale),
name="conv3_1")
self.dwsl.append(dws31)
dws32 = self.add_sublayer(
sublayer=DepthwiseSeparable(
num_channels=int(128 * scale),
num_filters1=128,
num_filters2=256,
num_groups=128,
stride=2,
scale=scale),
name="conv3_2")
self.dwsl.append(dws32)
dws41 = self.add_sublayer(
sublayer=DepthwiseSeparable(
num_channels=int(256 * scale),
num_filters1=256,
num_filters2=256,
num_groups=256,
stride=1,
scale=scale),
name="conv4_1")
self.dwsl.append(dws41)
dws42 = self.add_sublayer(
sublayer=DepthwiseSeparable(
num_channels=int(256 * scale),
num_filters1=256,
num_filters2=512,
num_groups=256,
stride=2,
scale=scale),
name="conv4_2")
self.dwsl.append(dws42)
for i in range(5):
tmp = self.add_sublayer(
sublayer=DepthwiseSeparable(
num_channels=int(512 * scale),
num_filters1=512,
num_filters2=512,
num_groups=512,
stride=1,
scale=scale),
name="conv5_" + str(i + 1))
self.dwsl.append(tmp)
dws56 = self.add_sublayer(
sublayer=DepthwiseSeparable(
num_channels=int(512 * scale),
num_filters1=512,
num_filters2=1024,
num_groups=512,
stride=2,
scale=scale),
name="conv5_6")
self.dwsl.append(dws56)
dws6 = self.add_sublayer(
sublayer=DepthwiseSeparable(
num_channels=int(1024 * scale),
num_filters1=1024,
num_filters2=1024,
num_groups=1024,
stride=1,
scale=scale),
name="conv6")
self.dwsl.append(dws6)
self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
self.out = Linear(
int(1024 * scale),
class_dim,
param_attr=ParamAttr(
initializer=MSRA(), name=self.full_name() + "fc7_weights"),
bias_attr=ParamAttr(name="fc7_offset"))
def forward(self, inputs):
y = self.conv1(inputs)
for dws in self.dwsl:
y = dws(y)
y = self.pool2d_avg(y)
y = fluid.layers.reshape(y, shape=[-1, 1024])
y = self.out(y)
return y
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
import os
import time
import math
import sys
import numpy as np
import argparse
import paddle
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid import framework
class ConvBNLayer(fluid.dygraph.Layer):
def __init__(self,
num_channels,
filter_size,
num_filters,
stride,
padding,
channels=None,
num_groups=1,
use_cudnn=True):
super(ConvBNLayer, self).__init__()
tmp_param = ParamAttr(name=self.full_name() + "_weights")
self._conv = Conv2D(
num_channels=num_channels,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=num_groups,
act=None,
use_cudnn=use_cudnn,
param_attr=tmp_param,
bias_attr=False)
self._batch_norm = BatchNorm(
num_filters,
param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"),
bias_attr=ParamAttr(name=self.full_name() + "_bn" + "_offset"),
moving_mean_name=self.full_name() + "_bn" + '_mean',
moving_variance_name=self.full_name() + "_bn" + '_variance')
def forward(self, inputs, if_act=True):
y = self._conv(inputs)
y = self._batch_norm(y)
if if_act:
y = fluid.layers.relu6(y)
return y
class InvertedResidualUnit(fluid.dygraph.Layer):
def __init__(
self,
num_channels,
num_in_filter,
num_filters,
stride,
filter_size,
padding,
expansion_factor, ):
super(InvertedResidualUnit, self).__init__()
num_expfilter = int(round(num_in_filter * expansion_factor))
self._expand_conv = ConvBNLayer(
num_channels=num_channels,
num_filters=num_expfilter,
filter_size=1,
stride=1,
padding=0,
num_groups=1)
self._bottleneck_conv = ConvBNLayer(
num_channels=num_expfilter,
num_filters=num_expfilter,
filter_size=filter_size,
stride=stride,
padding=padding,
num_groups=num_expfilter,
use_cudnn=False)
self._linear_conv = ConvBNLayer(
num_channels=num_expfilter,
num_filters=num_filters,
filter_size=1,
stride=1,
padding=0,
num_groups=1)
def forward(self, inputs, ifshortcut):
y = self._expand_conv(inputs, if_act=True)
y = self._bottleneck_conv(y, if_act=True)
y = self._linear_conv(y, if_act=False)
if ifshortcut:
y = fluid.layers.elementwise_add(inputs, y)
return y
class InvresiBlocks(fluid.dygraph.Layer):
def __init__(self, in_c, t, c, n, s):
super(InvresiBlocks, self).__init__()
self._first_block = InvertedResidualUnit(
num_channels=in_c,
num_in_filter=in_c,
num_filters=c,
stride=s,
filter_size=3,
padding=1,
expansion_factor=t)
self._inv_blocks = []
for i in range(1, n):
tmp = self.add_sublayer(
sublayer=InvertedResidualUnit(
num_channels=c,
num_in_filter=c,
num_filters=c,
stride=1,
filter_size=3,
padding=1,
expansion_factor=t),
name=self.full_name() + "_" + str(i + 1))
self._inv_blocks.append(tmp)
def forward(self, inputs):
y = self._first_block(inputs, ifshortcut=False)
for inv_block in self._inv_blocks:
y = inv_block(y, ifshortcut=True)
return y
class MobileNetV2(fluid.dygraph.Layer):
def __init__(self, class_dim=1000, scale=1.0):
super(MobileNetV2, self).__init__()
self.scale = scale
self.class_dim = class_dim
bottleneck_params_list = [
(1, 16, 1, 1),
(6, 24, 2, 2),
(6, 32, 3, 2),
(6, 64, 4, 2),
(6, 96, 3, 1),
(6, 160, 3, 2),
(6, 320, 1, 1),
]
#1. conv1
self._conv1 = ConvBNLayer(
num_channels=3,
num_filters=int(32 * scale),
filter_size=3,
stride=2,
padding=1)
#2. bottleneck sequences
self._invl = []
i = 1
in_c = int(32 * scale)
for layer_setting in bottleneck_params_list:
t, c, n, s = layer_setting
i += 1
tmp = self.add_sublayer(
sublayer=InvresiBlocks(
in_c=in_c, t=t, c=int(c * scale), n=n, s=s),
name='conv' + str(i))
self._invl.append(tmp)
in_c = int(c * scale)
#3. last_conv
self._out_c = int(1280 * scale) if scale > 1.0 else 1280
self._conv9 = ConvBNLayer(
num_channels=in_c,
num_filters=self._out_c,
filter_size=1,
stride=1,
padding=0)
#4. pool
self._pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
#5. fc
tmp_param = ParamAttr(name=self.full_name() + "fc10_weights")
self._fc = Linear(
self._out_c,
class_dim,
param_attr=tmp_param,
bias_attr=ParamAttr(name="fc10_offset"))
def forward(self, inputs):
y = self._conv1(inputs, if_act=True)
for inv in self._invl:
y = inv(y)
y = self._conv9(y, if_act=True)
y = self._pool2d_avg(y)
y = fluid.layers.reshape(y, shape=[-1, self._out_c])
y = self._fc(y)
return y
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
def make_divisible(v, divisor=8, min_value=None):
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
if new_v < 0.9 * v:
new_v += divisor
return new_v
class MobileNetV3_dy(fluid.dygraph.Layer):
def __init__(self, scale=1.0, model_name='small', class_dim=1000):
super(MobileNetV3_dy, self).__init__()
inplanes = 16
if model_name == "large":
self.cfg = [
# k, exp, c, se, nl, s,
[3, 16, 16, False, 'relu', 1],
[3, 64, 24, False, 'relu', 2],
[3, 72, 24, False, 'relu', 1],
[5, 72, 40, True, 'relu', 2],
[5, 120, 40, True, 'relu', 1],
[5, 120, 40, True, 'relu', 1],
[3, 240, 80, False, 'hard_swish', 2],
[3, 200, 80, False, 'hard_swish', 1],
[3, 184, 80, False, 'hard_swish', 1],
[3, 184, 80, False, 'hard_swish', 1],
[3, 480, 112, True, 'hard_swish', 1],
[3, 672, 112, True, 'hard_swish', 1],
[5, 672, 160, True, 'hard_swish', 2],
[5, 960, 160, True, 'hard_swish', 1],
[5, 960, 160, True, 'hard_swish', 1],
]
self.cls_ch_squeeze = 960
self.cls_ch_expand = 1280
elif model_name == "small":
self.cfg = [
# k, exp, c, se, nl, s,
[3, 16, 16, True, 'relu', 2],
[3, 72, 24, False, 'relu', 2],
[3, 88, 24, False, 'relu', 1],
[5, 96, 40, True, 'hard_swish', 2],
[5, 240, 40, True, 'hard_swish', 1],
[5, 240, 40, True, 'hard_swish', 1],
[5, 120, 48, True, 'hard_swish', 1],
[5, 144, 48, True, 'hard_swish', 1],
[5, 288, 96, True, 'hard_swish', 2],
[5, 576, 96, True, 'hard_swish', 1],
[5, 576, 96, True, 'hard_swish', 1],
]
self.cls_ch_squeeze = 576
self.cls_ch_expand = 1280
else:
raise NotImplementedError("mode[" + model_name +
"_model] is not implemented!")
# network
# conv1
self.conv1 = ConvBnLayer(
in_c=3,
out_c=make_divisible(inplanes * scale),
filter_size=3,
stride=2,
padding=1,
num_groups=1,
if_act=True,
act='hard_swish',
name=self.full_name() + 'conv1')
# conv_blocks
self.all_blocks = []
i = 0
inplanes = make_divisible(inplanes * scale)
for (k, exp, c, se, nl, s) in self.cfg:
self.all_blocks.append(
ResidualUnit(
in_c=inplanes,
mid_c=make_divisible(scale * exp),
out_c=make_divisible(scale * c),
filter_size=k,
stride=s,
ues_se=se,
act=nl,
name=self.full_name() + 'conv' + str(i + 2)))
self.add_sublayer(
sublayer=self.all_blocks[-1],
name=self.full_name() + 'conv' + str(i + 2))
inplanes = make_divisible(scale * c)
i += 1
# last_second_conv
self.last_second_conv = ConvBnLayer(
in_c=inplanes,
out_c=make_divisible(scale * self.cls_ch_squeeze),
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True,
act='hard_swish',
name=self.full_name() + 'last_second_conv')
# global_avg_pool
self.pool = fluid.dygraph.Pool2D(
pool_type='avg', global_pooling=True, use_cudnn=False)
# last_conv
self.last_conv = fluid.dygraph.Conv2D(
num_channels=make_divisible(scale * self.cls_ch_squeeze),
num_filters=self.cls_ch_expand,
filter_size=1,
stride=1,
padding=0,
act=None,
param_attr=ParamAttr(
name=self.full_name() + 'last_1x1_conv_weights'),
bias_attr=False)
# fc
self.fc = fluid.dygraph.Linear(
input_dim=self.cls_ch_expand,
output_dim=class_dim,
param_attr=ParamAttr(name=self.full_name() + 'fc_weights'))
def forward(self, inputs, label=None, dropout_prob=0.2):
x = self.conv1(inputs)
for i in range(len(self.all_blocks)):
x = self.all_blocks[i](x)
x = self.last_second_conv(x)
x = self.pool(x)
x = self.last_conv(x)
x = fluid.layers.hard_swish(x)
x = fluid.layers.dropout(x=x, dropout_prob=dropout_prob)
x = fluid.layers.reshape(x, shape=[x.shape[0], x.shape[1]])
x = self.fc(x)
if label:
acc1 = fluid.layers.accuracy(input=x, label=label)
acc5 = fluid.layers.accuracy(input=x, label=label, k=5)
return x, acc1, acc5
return x
class ConvBnLayer(fluid.dygraph.Layer):
def __init__(self,
in_c,
out_c,
filter_size,
stride,
padding,
num_groups=1,
if_act=True,
act=None,
use_cudnn=True,
name=''):
super(ConvBnLayer, self).__init__()
self.if_act = if_act
self.act = act
self.conv = fluid.dygraph.Conv2D(
num_channels=in_c,
num_filters=out_c,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=num_groups,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False,
use_cudnn=use_cudnn,
act=None)
self.bn = fluid.dygraph.BatchNorm(
num_channels=out_c,
act=None,
param_attr=ParamAttr(
name=name + "_bn" + "_scale",
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0)),
bias_attr=ParamAttr(
name=name + "_bn" + "_offset",
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0)),
moving_mean_name=name + "_bn" + '_mean',
moving_variance_name=name + "_bn" + '_variance')
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
if self.if_act:
if self.act == 'relu':
x = fluid.layers.relu(x)
elif self.act == 'hard_swish':
x = fluid.layers.hard_swish(x)
else:
print('The activation function is selected incorrectly.')
exit()
return x
class ResidualUnit(fluid.dygraph.Layer):
def __init__(self,
in_c,
mid_c,
out_c,
filter_size,
stride,
ues_se,
act=None,
name=''):
super(ResidualUnit, self).__init__()
self.if_shortcut = stride == 1 and in_c == out_c
self.if_se = ues_se
self.expand_conv = ConvBnLayer(
in_c=in_c,
out_c=mid_c,
filter_size=1,
stride=1,
padding=0,
if_act=True,
act=act,
name=name + '_expand')
self.bottleneck_conv = ConvBnLayer(
in_c=mid_c,
out_c=mid_c,
filter_size=filter_size,
stride=stride,
padding=int((filter_size - 1) // 2),
num_groups=mid_c,
if_act=True,
act=act,
name=name + '_depthwise')
if self.if_se:
self.mid_se = SEModule(mid_c, name=name + '_SE')
self.linear_conv = ConvBnLayer(
in_c=mid_c,
out_c=out_c,
filter_size=1,
stride=1,
padding=0,
if_act=False,
act=None,
name=name + '_linear')
def forward(self, inputs):
x = self.expand_conv(inputs)
x = self.bottleneck_conv(x)
if self.if_se:
x = self.mid_se(x)
x = self.linear_conv(x)
if self.if_shortcut:
x = fluid.layers.elementwise_add(inputs, x)
return x
class SEModule(fluid.dygraph.Layer):
def __init__(self, channel, reduction=4, name=''):
super(SEModule, self).__init__()
self.avg_pool = fluid.dygraph.Pool2D(
pool_type='avg', global_pooling=True, use_cudnn=False)
self.conv1 = fluid.dygraph.Conv2D(
num_channels=channel,
num_filters=channel // reduction,
filter_size=1,
stride=1,
padding=0,
act='relu',
param_attr=ParamAttr(name=name + "_weights1"),
bias_attr=ParamAttr(name=name + "_bias1"))
self.conv2 = fluid.dygraph.Conv2D(
num_channels=channel // reduction,
num_filters=channel,
filter_size=1,
stride=1,
padding=0,
act=None,
param_attr=ParamAttr(name=name + "_weights2"),
bias_attr=ParamAttr(name=name + "_bias2"))
def forward(self, inputs):
outputs = self.avg_pool(inputs)
outputs = self.conv1(outputs)
outputs = self.conv2(outputs)
outputs = fluid.layers.hard_sigmoid(outputs, slope=0.2, offset=0.5)
return fluid.layers.elementwise_mul(x=inputs, y=outputs, axis=0)
if __name__ == "__main__":
import numpy as np
place = fluid.CPUPlace()
with fluid.dygraph.guard(place):
model = MobileNetV3_dy(scale=0.6, model_name='large', class_dim=1000)
img = np.random.uniform(0, 255, [8, 3, 224, 224]).astype('float32')
img = fluid.dygraph.to_variable(img)
res = model(img)
print(res.shape)
# out_dygraph, static_layer = fluid.dygraph.TracedLayer.trace(model, inputs=[img])
# out_static_graph = static_layer([img])
# print(len(out_static_graph)) # 1
# print(out_static_graph[0].shape)
# static_layer.save_inference_model(dirname='./saved_infer_model')
# from calc_flops import *
# place = fluid.CPUPlace()
# exe = fluid.Executor(place)
# program, feed_vars, fetch_vars = fluid.io.load_inference_model('./saved_infer_model', exe)
# total_flops_params, is_quantize = summary(program)
# coding:utf-8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
import numpy as np
from dy_models.func_conv import conv2d
#from func_conv import conv2d
import pdb
def make_divisible(v, divisor=8, min_value=None):
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
if new_v < 0.9 * v:
new_v += divisor
return new_v
class OFA_kernel(fluid.dygraph.Layer):
def __init__(self,
scale=1.0,
model_name='large',
token=[],
class_dim=1000,
ofa_mode=None,
trainable_besides_trans=False):
super(OFA_kernel, self).__init__()
# Parsing token
assert len(token) >= 45
self.kernel_size_lis = tuple(token[:20])
self.exp_lis = tuple(token[20:40])
self.depth_lis = tuple(token[40:45])
# Hyperparameter
self.scale = scale
self.inplanes = 16
self.class_dim = class_dim
self.ofa_mode = ofa_mode
self.trainable_besides_trans = trainable_besides_trans
if model_name == "large":
# The search space is the last five digits
self.cfg_channel = (16, 24, 40, 80, 112, 160)
self.cfg_stride = (1, 2, 2, 2, 1, 2)
self.cfg_se = (False, False, True, False, True, True)
self.cfg_act = ('relu', 'relu', 'relu', 'hard_swish', 'hard_swish',
'hard_swish')
self.cls_ch_squeeze = 960
self.cls_ch_expand = 1280
else:
raise NotImplementedError("mode[" + model_name +
"_model] is not implemented!")
# conv1
self.conv1 = DynamicConvBnLayer(
in_c=3,
out_c=make_divisible(self.inplanes * self.scale),
filter_size=3,
stride=2,
padding=1,
num_groups=1,
if_act=True,
act='hard_swish',
ofa_mode=None,
trainable_besides_trans=self.trainable_besides_trans,
name='conv1')
self.inplanes = make_divisible(self.inplanes * self.scale)
# conv2
num_mid_filter = make_divisible(self.inplanes * self.scale)
_num_out_filter = self.cfg_channel[0]
num_out_filter = make_divisible(self.scale * _num_out_filter)
self.conv2 = ResidualUnit(
in_c=self.inplanes,
mid_c=num_out_filter,
out_c=num_out_filter,
filter_size=3,
stride=self.cfg_stride[0],
use_se=self.cfg_se[0],
act=self.cfg_act[0],
ofa_mode=None,
trainable_besides_trans=self.trainable_besides_trans,
name='conv2')
self.inplanes = make_divisible(self.cfg_channel[0] * self.scale)
# conv_blocks
i = 3
self.conv_blocks = []
for depth_id in range(len(self.depth_lis)):
for repeat_time in range(self.depth_lis[depth_id]):
num_mid_filter = make_divisible(
self.scale * _num_out_filter *
self.exp_lis[depth_id * 4 + repeat_time])
_num_out_filter = self.cfg_channel[depth_id + 1]
num_out_filter = make_divisible(self.scale * _num_out_filter)
stride = self.cfg_stride[depth_id +
1] if repeat_time == 0 else 1
self.conv_blocks.append(
ResidualUnit(
in_c=self.inplanes,
mid_c=num_mid_filter,
out_c=num_out_filter,
filter_size=self.kernel_size_lis[depth_id * 4 +
repeat_time],
stride=stride,
use_se=self.cfg_se[depth_id + 1],
act=self.cfg_act[depth_id + 1],
ofa_mode=self.ofa_mode,
trainable_besides_trans=self.trainable_besides_trans,
name='conv' + str(i)))
self.add_sublayer(
sublayer=self.conv_blocks[-1], name='conv' + str(i))
self.inplanes = make_divisible(self.scale *
self.cfg_channel[depth_id + 1])
i += 1
# last_second_conv
self.last_second_conv = DynamicConvBnLayer(
in_c=self.inplanes,
out_c=make_divisible(self.cls_ch_squeeze * self.scale),
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True,
act='hard_swish',
ofa_mode=None,
trainable_besides_trans=self.trainable_besides_trans,
name='last_second_conv')
# global_avg_pool
self.global_avg_pool = fluid.dygraph.Pool2D(
pool_type='avg', global_pooling=True, use_cudnn=False)
# last_conv
self.last_conv = fluid.dygraph.Conv2D(
num_channels=make_divisible(self.cls_ch_squeeze * self.scale),
num_filters=self.cls_ch_expand,
filter_size=1,
stride=1,
padding=0,
groups=1,
act=None,
param_attr=ParamAttr(
name='last_conv_weights',
trainable=self.trainable_besides_trans),
bias_attr=ParamAttr(
name='last_conv_bias', trainable=self.trainable_besides_trans))
# fc
self.fc = fluid.dygraph.Linear(
input_dim=self.cls_ch_expand,
output_dim=self.class_dim,
param_attr=ParamAttr(
name='fc_weights', trainable=self.trainable_besides_trans),
bias_attr=ParamAttr(
name='fc_bias', trainable=self.trainable_besides_trans))
def forward(self,
inputs,
label=None,
dy_token=[],
dy_trainable=[],
dropout_prob=0.2):
#pdb.set_trace()
x = self.conv1(inputs)
x = self.conv2(x)
if self.ofa_mode == 'kernel':
assert len(dy_token) >= sum(self.depth_lis)
assert len(dy_trainable) >= sum(self.depth_lis)
count = 0
i = -1
for depth_id in range(len(self.depth_lis)):
for repeat_time in range(self.depth_lis[depth_id]):
i += 1
if self.kernel_size_lis[depth_id * 4 +
repeat_time] != dy_token[i]:
count += 1
if count != 1:
# print('WARNING: According to the original paper, only one filter should be transformed in each iteration, but the current iteration transforms {} filter(s).'.format(count))
pass
else:
dy_token = [None for i in range(sum(self.depth_lis))]
dy_trainable = [True for i in range(sum(self.depth_lis))]
for i in range(len(self.conv_blocks)):
x = self.conv_blocks[i](x, dy_token[i], dy_trainable[i])
#pdb.set_trace()
x = self.last_second_conv(x)
x = self.global_avg_pool(x)
x = self.last_conv(x)
x = fluid.layers.hard_swish(x)
x = fluid.layers.dropout(x=x, dropout_prob=dropout_prob)
x = fluid.layers.squeeze(x, axes=[])
x = self.fc(x)
if label:
acc1 = fluid.layers.accuracy(input=x, label=label)
acc5 = fluid.layers.accuracy(input=x, label=label, k=5)
return x, acc1, acc5
return x
class DynamicConvBnLayer(fluid.dygraph.Layer):
def __init__(self,
in_c,
out_c,
filter_size,
stride,
padding,
num_groups=1,
if_act=True,
act=None,
use_cudnn=True,
ofa_mode=None,
trainable_besides_trans=True,
name=''):
super(DynamicConvBnLayer, self).__init__()
self.if_act = if_act
self.act = act
self.filter_size = filter_size
self.stride = stride
self.num_groups = num_groups
self.ofa_mode = ofa_mode
self.trainable_besides_trans = trainable_besides_trans
self.use_cudnn = use_cudnn
self.name = name
self.conv = fluid.dygraph.Conv2D(
num_channels=in_c,
num_filters=out_c,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=num_groups,
param_attr=ParamAttr(
trainable=self.trainable_besides_trans, name=name + "_weights"),
bias_attr=False,
use_cudnn=use_cudnn,
act=None)
self.bn = fluid.dygraph.BatchNorm(
num_channels=out_c,
act=None,
param_attr=ParamAttr(
name=name + "_bn" + "_scale",
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0),
trainable=self.trainable_besides_trans,
learning_rate=1.0 if self.trainable_besides_trans else 0.0),
bias_attr=ParamAttr(
name=name + "_bn" + "_offset",
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0),
trainable=self.trainable_besides_trans,
learning_rate=1.0 if self.trainable_besides_trans else 0.0),
moving_mean_name=name + "_bn" + '_mean',
moving_variance_name=name + "_bn" + '_variance',
use_global_stats=True)
if not self.ofa_mode:
pass
elif self.ofa_mode == 'kernel':
if num_groups == 1:
raise RuntimeError(
'OFA only supports depthwise convolution for kernel transformation operations.'
)
self.trans_block = []
if filter_size >= 5:
_init_np_array = np.eye(9)
self.trans_block.append(
fluid.dygraph.Linear(
input_dim=9,
output_dim=9,
param_attr=ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(
_init_np_array),
regularizer=fluid.regularizer.L2Decay(0.0),
name=name + "_transLinear_5to3"),
bias_attr=False))
self.add_sublayer(
sublayer=self.trans_block[-1],
name=name + "_transLinear_5to3")
self.bn_3x3 = fluid.dygraph.BatchNorm(
num_channels=out_c,
act=None,
param_attr=ParamAttr(
name=name + "_bn_3x3" + "_scale",
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0)),
bias_attr=ParamAttr(
name=name + "_bn_3x3" + "_offset",
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0)),
moving_mean_name=name + "_bn_3x3" + '_mean',
moving_variance_name=name + "_bn_3x3" + '_variance')
if filter_size >= 7:
_init_np_array = np.eye(25)
self.trans_block.insert(
0,
fluid.dygraph.Linear(
input_dim=25,
output_dim=25,
param_attr=ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(
_init_np_array),
regularizer=fluid.regularizer.L2Decay(0.0),
name=name + "_transLinear_7to5"),
bias_attr=False))
self.add_sublayer(
sublayer=self.trans_block[0],
name=name + "_transLinear_7to5")
self.bn_5x5 = fluid.dygraph.BatchNorm(
num_channels=out_c,
act=None,
param_attr=ParamAttr(
name=name + "_bn_5x5" + "_scale",
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0)),
bias_attr=ParamAttr(
name=name + "_bn_5x5" + "_offset",
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0)),
moving_mean_name=name + "_bn_5x5" + '_mean',
moving_variance_name=name + "_bn_5x5" + '_variance')
else:
raise NotImplementedError("OFA_mode [" + ofa_mode +
"] is not implemented!")
def forward(self, x, dy_filter_size=None, dy_trans_trainable=True):
if self.ofa_mode == 'kernel':
for i in range(len(self.trans_block)):
self.trans_block[i].weight.trainable = dy_trans_trainable
# print(self.trans_block[0].weight.trainable)
assert dy_filter_size in [3, 5, 7, None]
if dy_filter_size is None or dy_filter_size == self.filter_size:
x = self.conv(x)
elif dy_filter_size > self.filter_size:
raise RuntimeError(
'The new filter size should be less than or equal to the size of the original model.'
)
else:
kernel_weight_np = self.conv.weight.numpy().copy()
kernel_weight = fluid.dygraph.to_variable(
kernel_weight_np, zero_copy=False)
kernel_weight.stop_gradient = False
_batch_size, _channel = kernel_weight.shape[
0], kernel_weight.shape[1]
i = 0
while dy_filter_size < kernel_weight.shape[-1]:
kernel_weight = kernel_weight[:, :, 1:-1, 1:-1]
kernel_weight = fluid.layers.reshape(
kernel_weight, [_batch_size, _channel, -1])
kernel_weight = fluid.layers.reshape(
kernel_weight, [-1, kernel_weight.shape[-1]])
kernel_weight = self.trans_block[i](kernel_weight)
kernel_weight = fluid.layers.reshape(
kernel_weight, [_batch_size, _channel, -1])
_new_filter_size = int(kernel_weight.shape[-1]**0.5)
kernel_weight = fluid.layers.reshape(kernel_weight, [
_batch_size, _channel, _new_filter_size,
_new_filter_size
])
i += 1
x = conv2d(
x,
weight=kernel_weight,
bias=None,
padding=int((dy_filter_size - 1) // 2),
stride=self.stride,
dilation=1,
groups=self.num_groups,
use_cudnn=self.use_cudnn,
act=None,
data_format='NCHW',
name=self.name + '_transConv')
if self.filter_size == dy_filter_size:
x = self.bn(x)
elif dy_filter_size == 5:
x = self.bn_5x5(x)
elif dy_filter_size == 3:
x = self.bn_3x3(x)
else:
print('ERROR')
exit()
else:
x = self.conv(x)
x = self.bn(x)
if self.if_act:
if self.act == 'relu':
x = fluid.layers.relu(x)
elif self.act == 'hard_swish':
x = fluid.layers.hard_swish(x)
elif self.act == 'mish':
x = x * fluid.layers.tanh(fluid.layers.softplus(x))
else:
print('The activation function is selected incorrectly.')
exit()
return x
class ResidualUnit(fluid.dygraph.Layer):
def __init__(self,
in_c,
mid_c,
out_c,
filter_size,
stride,
use_se,
act=None,
ofa_mode=None,
trainable_besides_trans=True,
name=''):
super(ResidualUnit, self).__init__()
self.if_shortcut = stride == 1 and in_c == out_c
self.if_se = use_se
self.expand_conv = DynamicConvBnLayer(
in_c=in_c,
out_c=mid_c,
filter_size=1,
stride=1,
padding=0,
if_act=True,
act=act,
ofa_mode=None,
trainable_besides_trans=trainable_besides_trans,
name=name + '_expand')
self.bottleneck_conv = DynamicConvBnLayer(
in_c=mid_c,
out_c=mid_c,
filter_size=filter_size,
stride=stride,
padding=int((filter_size - 1) // 2),
num_groups=mid_c,
if_act=True,
act=act,
use_cudnn=False,
ofa_mode=ofa_mode,
trainable_besides_trans=trainable_besides_trans,
name=name + '_depthwise')
if self.if_se:
self.mid_se = SEModule(
mid_c,
trainable_besides_trans=trainable_besides_trans,
name=name + '_SE')
self.linear_conv = DynamicConvBnLayer(
in_c=mid_c,
out_c=out_c,
filter_size=1,
stride=1,
padding=0,
if_act=False,
act=None,
ofa_mode=None,
trainable_besides_trans=trainable_besides_trans,
name=name + '_linear')
def forward(self, inputs, dy_filter_size=None, dy_trans_trainable=True):
x = self.expand_conv(inputs)
x = self.bottleneck_conv(x, dy_filter_size, dy_trans_trainable)
if self.if_se:
x = self.mid_se(x)
x = self.linear_conv(x)
if self.if_shortcut:
x = fluid.layers.elementwise_add(inputs, x)
return x
class SEModule(fluid.dygraph.Layer):
def __init__(self,
channel,
reduction=4,
trainable_besides_trans=True,
name=''):
super(SEModule, self).__init__()
self.avg_pool = fluid.dygraph.Pool2D(
pool_type='avg', global_pooling=True, use_cudnn=False)
self.conv1 = fluid.dygraph.Conv2D(
num_channels=channel,
num_filters=channel // reduction,
filter_size=1,
stride=1,
padding=0,
act='relu',
param_attr=ParamAttr(
name=name + "_weights1", trainable=trainable_besides_trans),
bias_attr=ParamAttr(
name=name + "_bias1", trainable=trainable_besides_trans))
self.conv2 = fluid.dygraph.Conv2D(
num_channels=channel // reduction,
num_filters=channel,
filter_size=1,
stride=1,
padding=0,
act=None,
param_attr=ParamAttr(
name=name + "_weights2", trainable=trainable_besides_trans),
bias_attr=ParamAttr(
name=name + "_bias2", trainable=trainable_besides_trans))
def forward(self, inputs):
outputs = self.avg_pool(inputs)
outputs = self.conv1(outputs)
outputs = self.conv2(outputs)
outputs = fluid.layers.hard_sigmoid(outputs, slope=0.2, offset=0.5)
return fluid.layers.elementwise_mul(x=inputs, y=outputs, axis=0)
if __name__ == "__main__":
# from calc_flops import summary
import numpy as np
place = fluid.CPUPlace()
with fluid.dygraph.guard(place):
model = OFA_kernel(
scale=1.0,
model_name='large',
token=[
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3,
3, 3, 3, 3, 4, 3, 4, 3, 6, 4, 4, 4, 4, 6, 3, 6, 4, 6, 3, 2, 3,
3, 3, 4
],
class_dim=1000,
ofa_mode='kernel',
trainable_besides_trans=False)
img = np.random.uniform(0, 255, [8, 3, 224, 224]).astype('float32')
img = fluid.dygraph.to_variable(img)
token = [7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 5, 7, 5, 7]
trainable = [
False, False, False, False, False, False, False, False, False,
False, False, True, False, False, False
]
res = model(img, dy_token=token, dy_trainable=trainable)
print(res.shape)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
import sys
import os
import math
import random
import functools
import numpy as np
import cv2
import paddle
from paddle import fluid
from utils.autoaugment import ImageNetPolicy
from PIL import Image
policy = None
random.seed(0)
np.random.seed(0)
def rotate_image(img):
"""rotate image
Args:
img: image data
Returns:
rotated image data
"""
(h, w) = img.shape[:2]
center = (w / 2, h / 2)
angle = np.random.randint(-10, 11)
M = cv2.getRotationMatrix2D(center, angle, 1.0)
rotated = cv2.warpAffine(img, M, (w, h))
return rotated
def random_crop(img, size, settings, scale=None, ratio=None,
interpolation=None):
"""random crop image
Args:
img: image data
size: crop size
settings: arguments
scale: scale parameter
ratio: ratio parameter
Returns:
random cropped image data
"""
lower_scale = settings.lower_scale
lower_ratio = settings.lower_ratio
upper_ratio = settings.upper_ratio
scale = [lower_scale, 1.0] if scale is None else scale
ratio = [lower_ratio, upper_ratio] if ratio is None else ratio
aspect_ratio = math.sqrt(np.random.uniform(*ratio))
w = 1. * aspect_ratio
h = 1. / aspect_ratio
bound = min((float(img.shape[0]) / img.shape[1]) / (h**2),
(float(img.shape[1]) / img.shape[0]) / (w**2))
scale_max = min(scale[1], bound)
scale_min = min(scale[0], bound)
target_area = img.shape[0] * img.shape[1] * np.random.uniform(scale_min,
scale_max)
target_size = math.sqrt(target_area)
w = int(target_size * w)
h = int(target_size * h)
i = np.random.randint(0, img.shape[0] - h + 1)
j = np.random.randint(0, img.shape[1] - w + 1)
img = img[i:i + h, j:j + w, :]
if interpolation:
resized = cv2.resize(img, (size, size), interpolation=interpolation)
else:
resized = cv2.resize(img, (size, size))
return resized
#NOTE:(2019/08/08) distort color func is not implemented
def distort_color(img):
"""distort image color
Args:
img: image data
Returns:
distorted color image data
"""
return img
def resize_short(img, target_size, interpolation=None):
"""resize image
Args:
img: image data
target_size: resize short target size
interpolation: interpolation mode
Returns:
resized image data
"""
percent = float(target_size) / min(img.shape[0], img.shape[1])
resized_width = int(round(img.shape[1] * percent))
resized_height = int(round(img.shape[0] * percent))
if interpolation:
resized = cv2.resize(
img, (resized_width, resized_height), interpolation=interpolation)
else:
resized = cv2.resize(img, (resized_width, resized_height))
return resized
def crop_image(img, target_size, center):
"""crop image
Args:
img: images data
target_size: crop target size
center: crop mode
Returns:
img: cropped image data
"""
height, width = img.shape[:2]
size = target_size
if center == True:
w_start = (width - size) // 2
h_start = (height - size) // 2
else:
w_start = np.random.randint(0, width - size + 1)
h_start = np.random.randint(0, height - size + 1)
w_end = w_start + size
h_end = h_start + size
img = img[h_start:h_end, w_start:w_end, :]
return img
def create_mixup_reader(settings, rd):
"""
"""
class context:
tmp_mix = []
tmp_l1 = []
tmp_l2 = []
tmp_lam = []
alpha = settings.mixup_alpha
def fetch_data():
for item in rd():
yield item
def mixup_data():
for data_list in fetch_data():
if alpha > 0.:
lam = np.random.beta(alpha, alpha)
else:
lam = 1.
l1 = np.array(data_list)
l2 = np.random.permutation(l1)
mixed_l = [
l1[i][0] * lam + (1 - lam) * l2[i][0] for i in range(len(l1))
]
yield (mixed_l, l1, l2, lam)
def mixup_reader():
for context.tmp_mix, context.tmp_l1, context.tmp_l2, context.tmp_lam in mixup_data(
):
for i in range(len(context.tmp_mix)):
mixed_l = context.tmp_mix[i]
l1 = context.tmp_l1[i]
l2 = context.tmp_l2[i]
lam = context.tmp_lam
yield (mixed_l, int(l1[1]), int(l2[1]), float(lam))
return mixup_reader
def process_image(sample, settings, mode, color_jitter, rotate):
""" process_image """
mean = settings.image_mean
std = settings.image_std
crop_size = settings.crop_size
img_path = sample[0]
img = cv2.imread(img_path)
if mode == 'train':
if rotate:
img = rotate_image(img)
if crop_size > 0:
img = random_crop(
img, crop_size, settings, interpolation=settings.interpolation)
if color_jitter:
img = distort_color(img)
if np.random.randint(0, 2) == 1:
img = img[:, ::-1, :]
else:
if crop_size > 0:
target_size = settings.resize_short_size
img = resize_short(
img, target_size, interpolation=settings.interpolation)
img = crop_image(img, target_size=crop_size, center=True)
img = img[:, :, ::-1]
if 'use_aa' in settings and settings.use_aa and mode == 'train':
img = np.ascontiguousarray(img)
img = Image.fromarray(img)
img = policy(img)
img = np.asarray(img)
img = img.astype('float32').transpose((2, 0, 1)) / 255
img_mean = np.array(mean).reshape((3, 1, 1))
img_std = np.array(std).reshape((3, 1, 1))
img -= img_mean
img /= img_std
if mode == 'train' or mode == 'val':
return (img, sample[1])
elif mode == 'test':
return (img, )
def process_batch_data(input_data, settings, mode, color_jitter, rotate):
batch_data = []
for sample in input_data:
if os.path.isfile(sample[0]):
batch_data.append(
process_image(sample, settings, mode, color_jitter, rotate))
else:
print("File not exist : %s" % sample[0])
return batch_data
class ImageNetReader:
def __init__(self, seed=None, place_num=1):
self.shuffle_seed = seed
self.place_num = place_num
def set_shuffle_seed(self, seed):
assert isinstance(seed, int), "shuffle seed must be int"
self.shuffle_seed = seed
def _reader_creator(self,
settings,
file_list,
mode,
shuffle=False,
color_jitter=False,
rotate=False,
data_dir=None):
num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
if mode == 'test':
batch_size = 1
else:
batch_size = settings.batch_size / self.place_num
def reader():
def read_file_list():
with open(file_list) as flist:
full_lines = [line.strip() for line in flist]
if mode != "test" and len(full_lines) < settings.batch_size:
print(
"Warning: The number of the whole data ({}) is smaller than the batch_size ({}), and drop_last is turnning on, so nothing will feed in program, Terminated now. Please reset batch_size to a smaller number or feed more data!"
.format(len(full_lines), settings.batch_size))
os._exit(1)
if num_trainers > 1 and mode == "train":
assert self.shuffle_seed is not None, "multiprocess train, shuffle seed must be set!"
np.random.RandomState(self.shuffle_seed).shuffle(
full_lines)
elif shuffle:
assert self.shuffle_seed is not None, "multiprocess train, shuffle seed must be set!"
np.random.RandomState(self.shuffle_seed).shuffle(
full_lines)
batch_data = []
for line in full_lines:
img_path, label = line.split()
img_path = os.path.join(data_dir, img_path)
batch_data.append([img_path, int(label)])
if len(batch_data) == batch_size:
if mode == 'train' or mode == 'val' or mode == 'test':
yield batch_data
batch_data = []
return read_file_list
data_reader = reader()
if mode == 'train' and num_trainers > 1:
assert self.shuffle_seed is not None, \
"If num_trainers > 1, the shuffle_seed must be set, because " \
"the order of batch data generated by reader " \
"must be the same in the respective processes."
data_reader = paddle.fluid.contrib.reader.distributed_batch_reader(
data_reader)
mapper = functools.partial(
process_batch_data,
settings=settings,
mode=mode,
color_jitter=color_jitter,
rotate=rotate)
ret = fluid.io.xmap_readers(
mapper,
data_reader,
settings.reader_thread,
settings.reader_buf_size,
order=False)
return ret
def train(self, settings):
"""Create a reader for trainning
Args:
settings: arguments
Returns:
train reader
"""
file_list = os.path.join(settings.data_dir, 'train_list.txt')
assert os.path.isfile(
file_list), "{} doesn't exist, please check data list path".format(
file_list)
if 'use_aa' in settings and settings.use_aa:
global policy
policy = ImageNetPolicy()
reader = self._reader_creator(
settings,
file_list,
'train',
shuffle=True,
color_jitter=False,
rotate=False,
data_dir=settings.data_dir)
if settings.use_mixup == True:
reader = create_mixup_reader(settings, reader)
reader = fluid.io.batch(
reader,
batch_size=int(settings.batch_size / self.place_num),
drop_last=True)
return reader
def val(self, settings):
"""Create a reader for eval
Args:
settings: arguments
Returns:
eval reader
"""
file_list = os.path.join(settings.data_dir, 'val_list.txt')
assert os.path.isfile(
file_list), "{} doesn't exist, please check data list path".format(
file_list)
return self._reader_creator(
settings,
file_list,
'val',
shuffle=False,
data_dir=settings.data_dir)
def test(self, settings):
"""Create a reader for testing
Args:
settings: arguments
Returns:
test reader
"""
file_list = os.path.join(settings.data_dir, 'val_list.txt')
assert os.path.isfile(
file_list), "{} doesn't exist, please check data list path".format(
file_list)
return self._reader_creator(
settings,
file_list,
'test',
shuffle=False,
data_dir=settings.data_dir)
export PYTHONPATH=$PWD:$PYTHONPATH
python -m paddle.distributed.launch --log_dir my_log train.py \
--use_data_parallel=True \
--batch_size=2048 \
--lr=1e-3 \
--l2_decay=3e-5 \
--total_images=1281167 \
--class_dim=1000 \
--image_shape=3,224,224 \
--model_save_dir=output.ofa.kernel \
--lr_strategy=piecewise_decay \
--num_epochs=360 \
--data_dir=./data/ILSVRC2012 \
--model=once_for_all_kernel \
--use_aa=False \
--checkpoint=./_ofa_epoch359
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import time
import sys
import math
import argparse
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid import framework
import reader
from utils import *
from dy_models import MobileNetV1, MobileNetV2, MobileNetV3_dy, OFA_kernel
args = parse_args()
if int(os.getenv("PADDLE_TRAINER_ID", 0)) == 0:
print_arguments(args)
def dy_token_gene(ori_token, alter=[3, 5, 7], num_of_changes=1):
kernel_token = tuple(ori_token[:20])
exp_token = tuple(ori_token[20:40])
depth_token = tuple(ori_token[40:45])
dy_token = []
for depth_id in range(len(depth_token)):
for repeat_time in range(depth_token[depth_id]):
dy_token.append(kernel_token[depth_id * 4 + repeat_time])
idx_lis = np.random.choice(len(dy_token), num_of_changes, replace=False)
for idx in idx_lis:
while True:
tmp_kernel = alter[np.random.choice(len(alter))]
if tmp_kernel != dy_token[idx]:
dy_token[idx] = tmp_kernel
break
return dy_token
def dy_token_gene_v2(ori_token, alter=[3, 5, 7], position_from_end=-1):
kernel_token = tuple(ori_token[:20])
exp_token = tuple(ori_token[20:40])
depth_token = tuple(ori_token[40:45])
dy_token = []
dy_trainable = []
for depth_id in range(len(depth_token)):
for repeat_time in range(depth_token[depth_id]):
dy_token.append(kernel_token[depth_id * 4 + repeat_time])
dy_trainable.append(False)
dy_token[position_from_end] = alter[0]
dy_trainable[position_from_end] = True
for i in range(abs(position_from_end) - 1):
dy_token[position_from_end + i + 1] = alter[np.random.choice(
len(alter))]
return dy_token, dy_trainable
def eval(net, test_data_loader, eop, position_from_end=1):
total_loss = 0.0
total_acc1 = 0.0
total_acc5 = 0.0
total_sample = 0
t_last = 0
token = [
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3,
3, 4, 3, 4, 3, 6, 4, 4, 4, 4, 6, 3, 6, 4, 6, 3, 2, 3, 3, 3, 4
]
place_num = paddle.fluid.core.get_cuda_device_count(
) if args.use_gpu else int(os.environ.get('CPU_NUM', 1))
for i, (img, label) in enumerate(test_data_loader()):
t1 = time.time()
label = to_variable(label.numpy().astype('int64').reshape(
int(args.batch_size // place_num), 1))
if args.model == "once_for_all_kernel":
dy_token, dy_trainable = dy_token_gene_v2(
token, alter=[5, 7], position_from_end=position_from_end)
print(dy_token)
out = net(img, dy_token=dy_token, dy_trainable=dy_trainable)
else:
out = net(img)
softmax_out = fluid.layers.softmax(out, use_cudnn=False)
loss = fluid.layers.cross_entropy(input=softmax_out, label=label)
avg_loss = fluid.layers.mean(x=loss)
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
t2 = time.time()
print( "test | epoch id: %d, batch id: %d, avg_loss %0.5f acc_top1 %0.5f acc_top5 %0.5f %2.4f sec read_t:%2.4f" % \
(eop, i, avg_loss.numpy(), acc_top1.numpy(), acc_top5.numpy(), t2 - t1 , t1 - t_last))
sys.stdout.flush()
total_loss += avg_loss.numpy()
total_acc1 += acc_top1.numpy()
total_acc5 += acc_top5.numpy()
total_sample += 1
t_last = time.time()
print("final eval loss %0.3f acc1 %0.3f acc5 %0.3f" % \
(total_loss / total_sample, \
total_acc1 / total_sample, total_acc5 / total_sample))
sys.stdout.flush()
def train_mobilenet():
if not args.use_gpu:
place = fluid.CPUPlace()
elif not args.use_data_parallel:
place = fluid.CUDAPlace(0)
else:
place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)
with fluid.dygraph.guard(place):
# 1. init net and optimizer
place_num = paddle.fluid.core.get_cuda_device_count(
) if args.use_gpu else int(os.environ.get('CPU_NUM', 1))
if args.ce:
print("ce mode")
seed = 33
np.random.seed(seed)
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
if args.use_data_parallel:
strategy = fluid.dygraph.parallel.prepare_context()
if args.model == "MobileNetV1":
net = MobileNetV1(class_dim=args.class_dim, scale=1.0)
model_path_pre = 'mobilenet_v1'
elif args.model == "MobileNetV2":
net = MobileNetV2(class_dim=args.class_dim, scale=1.0)
model_path_pre = 'mobilenet_v2'
elif args.model == "MobileNetV3":
net = MobileNetV3_dy(
class_dim=args.class_dim, scale=1.0, model_name='large')
model_path_pre = 'mobilenet_v3'
elif args.model == "once_for_all":
token = [
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3,
3, 3, 3, 3, 4, 3, 4, 3, 6, 4, 4, 4, 4, 6, 3, 6, 4, 6, 3, 2, 3,
3, 3, 4
]
net = OFA_kernel(
class_dim=args.class_dim,
scale=1.0,
model_name='large',
token=token,
ofa_mode=False)
model_path_pre = 'ofa'
elif args.model == "once_for_all_kernel":
token = [
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3,
3, 3, 3, 3, 4, 3, 4, 3, 6, 4, 4, 4, 4, 6, 3, 6, 4, 6, 3, 2, 3,
3, 3, 4
]
net = OFA_kernel(
class_dim=args.class_dim,
scale=1.0,
model_name='large',
token=token,
ofa_mode='kernel',
trainable_besides_trans=False)
model_path_pre = 'ofa_kernel'
else:
print(
"wrong model name, please try model = MobileNetV1 or MobileNetV2"
)
exit()
optimizer = create_optimizer(args=args, parameter_list=net.parameters())
if args.use_data_parallel:
net = fluid.dygraph.parallel.DataParallel(net, strategy)
# 2. load checkpoint
if args.checkpoint:
assert os.path.exists(args.checkpoint + ".pdparams"), \
"Given dir {}.pdparams not exist.".format(args.checkpoint)
assert os.path.exists(args.checkpoint + ".pdopt"), \
"Given dir {}.pdopt not exist.".format(args.checkpoint)
para_dict, opti_dict = fluid.dygraph.load_dygraph(args.checkpoint)
if args.model == "once_for_all_kernel":
inner_state_dict = net.state_dict()
for name, para in inner_state_dict.items():
key_name = name
if key_name in para_dict:
print(key_name)
para.set_value(para_dict[key_name])
else:
net.set_dict(para_dict)
optimizer.set_dict(opti_dict)
# 3. reader
train_data_loader, train_data = utility.create_data_loader(
is_train=True, args=args)
test_data_loader, test_data = utility.create_data_loader(
is_train=False, args=args)
num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
imagenet_reader = reader.ImageNetReader(seed=0, place_num=place_num)
train_reader = imagenet_reader.train(settings=args)
test_reader = imagenet_reader.val(settings=args)
train_data_loader.set_sample_list_generator(train_reader, place)
test_data_loader.set_sample_list_generator(test_reader, place)
with open('./dy_token.txt', 'r+') as f:
lines = f.readlines()
eval_sign = False
if eval_sign:
net.eval()
eval(net, test_data_loader, 1, -1)
exit()
# 4. train loop
total_batch_num = 0 #this is for benchmark
# Hyper-Params
total_epoch_each_layer = 15
fixed_epoch_each_layer = 10
position = 0
count = 0
for eop in range(args.num_epochs):
if num_trainers > 1:
imagenet_reader.set_shuffle_seed(eop + (
args.random_seed if args.random_seed else 0))
net.train()
total_loss = 0.0
total_acc1 = 0.0
total_acc5 = 0.0
total_sample = 0
batch_id = 0
t_last = 0
if eop % total_epoch_each_layer == 0:
position += 1
count = 0
with open(
os.path.join('./token', str(position) + '.txt'),
'r+') as f:
lines = f.readlines()
count += 1
# 4.1 for each batch, call net() , backward(), and minimize()
for idx, (img, label) in enumerate(train_data_loader()):
t1 = time.time()
if args.max_iter and total_batch_num == args.max_iter:
return
label = to_variable(label.numpy().astype('int64').reshape(
int(args.batch_size // place_num), 1))
t_start = time.time()
# 4.1.1 call net()
if args.model == "once_for_all_kernel":
dy_token = [int(x) for x in lines[idx].strip().split(',')]
if count <= fixed_epoch_each_layer:
dy_trainable = [False for _ in range(len(dy_token))]
dy_trainable[0 - position] = True
else:
dy_trainable = [False] * (len(dy_token) - position
) + [True] * position
if idx < 10:
print(dy_token)
print(dy_trainable)
out = net(img, dy_token=dy_token, dy_trainable=dy_trainable)
else:
out = net(img)
t_end = time.time()
softmax_out = fluid.layers.softmax(out, use_cudnn=False)
loss = fluid.layers.cross_entropy(
input=softmax_out, label=label)
avg_loss = fluid.layers.mean(x=loss)
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
t_start_back = time.time()
# 4.1.2 call backward()
if args.use_data_parallel:
avg_loss = net.scale_loss(avg_loss)
avg_loss.backward()
net.apply_collective_grads()
else:
avg_loss.backward()
t_end_back = time.time()
# 4.1.3 call minimize()
optimizer.minimize(avg_loss)
net.clear_gradients()
t2 = time.time()
train_batch_elapse = t2 - t1
if batch_id % args.print_step == 0:
print( "epoch id: %d, batch step: %d, avg_loss %0.5f, acc_top1 %0.5f, acc_top5 %0.5f, lr %f, %2.4f sec, net_t:%2.4f, back_t:%2.4f, read_t:%2.4f" % \
(eop, batch_id, avg_loss.numpy(), acc_top1.numpy(), acc_top5.numpy(), optimizer._global_learning_rate().numpy(), train_batch_elapse,
t_end - t_start, t_end_back - t_start_back, t1 - t_last))
sys.stdout.flush()
total_loss += avg_loss.numpy()
total_acc1 += acc_top1.numpy()
total_acc5 += acc_top5.numpy()
total_sample += 1
batch_id += 1
t_last = time.time()
# NOTE: used for benchmark
total_batch_num = total_batch_num + 1
if args.ce:
print("kpis\ttrain_acc1\t%0.3f" % (total_acc1 / total_sample))
print("kpis\ttrain_acc5\t%0.3f" % (total_acc5 / total_sample))
print("kpis\ttrain_loss\t%0.3f" % (total_loss / total_sample))
print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f %2.4f sec" % \
(eop, batch_id, total_loss / total_sample, \
total_acc1 / total_sample, total_acc5 / total_sample, train_batch_elapse))
# 4.2 save checkpoint
save_parameters = (not args.use_data_parallel) or (
args.use_data_parallel and
fluid.dygraph.parallel.Env().local_rank == 0)
if save_parameters:
if not os.path.isdir(args.model_save_dir):
os.makedirs(args.model_save_dir)
model_path = os.path.join(
args.model_save_dir,
"_" + model_path_pre + "_epoch{}".format(eop))
fluid.dygraph.save_dygraph(net.state_dict(), model_path)
fluid.dygraph.save_dygraph(optimizer.state_dict(), model_path)
# 4.3 validation
net.eval()
eval(net, test_data_loader, eop, 0 - position)
# 5. save final results
save_parameters = (not args.use_data_parallel) or (
args.use_data_parallel and
fluid.dygraph.parallel.Env().local_rank == 0)
if save_parameters:
model_path = os.path.join(args.model_save_dir,
"_" + model_path_pre + "_final")
fluid.dygraph.save_dygraph(net.state_dict(), model_path)
if __name__ == '__main__':
train_mobilenet()
from .optimizer import Optimizer, create_optimizer
from .utility import add_arguments, print_arguments, parse_args, check_gpu, check_args, check_version, init_model, save_model, create_data_loader, print_info, best_strategy_compiled, init_model, save_model, ExponentialMovingAverage
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
"""
This code is based on https://github.com/DeepVoltaire/AutoAugment/blob/master/autoaugment.py
"""
from PIL import Image, ImageEnhance, ImageOps
import numpy as np
import random
class ImageNetPolicy(object):
""" Randomly choose one of the best 24 Sub-policies on ImageNet.
Example:
>>> policy = ImageNetPolicy()
>>> transformed = policy(image)
Example as a PyTorch Transform:
>>> transform=transforms.Compose([
>>> transforms.Resize(256),
>>> ImageNetPolicy(),
>>> transforms.ToTensor()])
"""
def __init__(self, fillcolor=(128, 128, 128)):
self.policies = [
SubPolicy(0.4, "posterize", 8, 0.6, "rotate", 9, fillcolor),
SubPolicy(0.6, "solarize", 5, 0.6, "autocontrast", 5, fillcolor),
SubPolicy(0.8, "equalize", 8, 0.6, "equalize", 3, fillcolor),
SubPolicy(0.6, "posterize", 7, 0.6, "posterize", 6, fillcolor),
SubPolicy(0.4, "equalize", 7, 0.2, "solarize", 4, fillcolor),
SubPolicy(0.4, "equalize", 4, 0.8, "rotate", 8, fillcolor),
SubPolicy(0.6, "solarize", 3, 0.6, "equalize", 7, fillcolor),
SubPolicy(0.8, "posterize", 5, 1.0, "equalize", 2, fillcolor),
SubPolicy(0.2, "rotate", 3, 0.6, "solarize", 8, fillcolor),
SubPolicy(0.6, "equalize", 8, 0.4, "posterize", 6, fillcolor),
SubPolicy(0.8, "rotate", 8, 0.4, "color", 0, fillcolor),
SubPolicy(0.4, "rotate", 9, 0.6, "equalize", 2, fillcolor),
SubPolicy(0.0, "equalize", 7, 0.8, "equalize", 8, fillcolor),
SubPolicy(0.6, "invert", 4, 1.0, "equalize", 8, fillcolor),
SubPolicy(0.6, "color", 4, 1.0, "contrast", 8, fillcolor),
SubPolicy(0.8, "rotate", 8, 1.0, "color", 2, fillcolor),
SubPolicy(0.8, "color", 8, 0.8, "solarize", 7, fillcolor),
SubPolicy(0.4, "sharpness", 7, 0.6, "invert", 8, fillcolor),
SubPolicy(0.6, "shearX", 5, 1.0, "equalize", 9, fillcolor),
SubPolicy(0.4, "color", 0, 0.6, "equalize", 3, fillcolor),
SubPolicy(0.4, "equalize", 7, 0.2, "solarize", 4, fillcolor),
SubPolicy(0.6, "solarize", 5, 0.6, "autocontrast", 5, fillcolor),
SubPolicy(0.6, "invert", 4, 1.0, "equalize", 8, fillcolor),
SubPolicy(0.6, "color", 4, 1.0, "contrast", 8, fillcolor),
SubPolicy(0.8, "equalize", 8, 0.6, "equalize", 3, fillcolor)
]
def __call__(self, img, policy_idx=None):
if policy_idx is None or not isinstance(policy_idx, int):
policy_idx = random.randint(0, len(self.policies) - 1)
else:
policy_idx = policy_idx % len(self.policies)
return self.policies[policy_idx](img)
def __repr__(self):
return "AutoAugment ImageNet Policy"
class CIFAR10Policy(object):
""" Randomly choose one of the best 25 Sub-policies on CIFAR10.
Example:
>>> policy = CIFAR10Policy()
>>> transformed = policy(image)
Example as a PyTorch Transform:
>>> transform=transforms.Compose([
>>> transforms.Resize(256),
>>> CIFAR10Policy(),
>>> transforms.ToTensor()])
"""
def __init__(self, fillcolor=(128, 128, 128)):
self.policies = [
SubPolicy(0.1, "invert", 7, 0.2, "contrast", 6, fillcolor),
SubPolicy(0.7, "rotate", 2, 0.3, "translateX", 9, fillcolor),
SubPolicy(0.8, "sharpness", 1, 0.9, "sharpness", 3, fillcolor),
SubPolicy(0.5, "shearY", 8, 0.7, "translateY", 9, fillcolor),
SubPolicy(0.5, "autocontrast", 8, 0.9, "equalize", 2, fillcolor),
SubPolicy(0.2, "shearY", 7, 0.3, "posterize", 7, fillcolor),
SubPolicy(0.4, "color", 3, 0.6, "brightness", 7, fillcolor),
SubPolicy(0.3, "sharpness", 9, 0.7, "brightness", 9, fillcolor),
SubPolicy(0.6, "equalize", 5, 0.5, "equalize", 1, fillcolor),
SubPolicy(0.6, "contrast", 7, 0.6, "sharpness", 5, fillcolor),
SubPolicy(0.7, "color", 7, 0.5, "translateX", 8, fillcolor),
SubPolicy(0.3, "equalize", 7, 0.4, "autocontrast", 8, fillcolor),
SubPolicy(0.4, "translateY", 3, 0.2, "sharpness", 6, fillcolor),
SubPolicy(0.9, "brightness", 6, 0.2, "color", 8, fillcolor),
SubPolicy(0.5, "solarize", 2, 0.0, "invert", 3, fillcolor),
SubPolicy(0.2, "equalize", 0, 0.6, "autocontrast", 0, fillcolor),
SubPolicy(0.2, "equalize", 8, 0.8, "equalize", 4, fillcolor),
SubPolicy(0.9, "color", 9, 0.6, "equalize", 6, fillcolor),
SubPolicy(0.8, "autocontrast", 4, 0.2, "solarize", 8, fillcolor),
SubPolicy(0.1, "brightness", 3, 0.7, "color", 0, fillcolor),
SubPolicy(0.4, "solarize", 5, 0.9, "autocontrast", 3, fillcolor),
SubPolicy(0.9, "translateY", 9, 0.7, "translateY", 9, fillcolor),
SubPolicy(0.9, "autocontrast", 2, 0.8, "solarize", 3, fillcolor),
SubPolicy(0.8, "equalize", 8, 0.1, "invert", 3, fillcolor),
SubPolicy(0.7, "translateY", 9, 0.9, "autocontrast", 1, fillcolor)
]
def __call__(self, img, policy_idx=None):
if policy_idx is None or not isinstance(policy_idx, int):
policy_idx = random.randint(0, len(self.policies) - 1)
else:
policy_idx = policy_idx % len(self.policies)
return self.policies[policy_idx](img)
def __repr__(self):
return "AutoAugment CIFAR10 Policy"
class SVHNPolicy(object):
""" Randomly choose one of the best 25 Sub-policies on SVHN.
Example:
>>> policy = SVHNPolicy()
>>> transformed = policy(image)
Example as a PyTorch Transform:
>>> transform=transforms.Compose([
>>> transforms.Resize(256),
>>> SVHNPolicy(),
>>> transforms.ToTensor()])
"""
def __init__(self, fillcolor=(128, 128, 128)):
self.policies = [
SubPolicy(0.9, "shearX", 4, 0.2, "invert", 3, fillcolor),
SubPolicy(0.9, "shearY", 8, 0.7, "invert", 5, fillcolor),
SubPolicy(0.6, "equalize", 5, 0.6, "solarize", 6, fillcolor),
SubPolicy(0.9, "invert", 3, 0.6, "equalize", 3, fillcolor),
SubPolicy(0.6, "equalize", 1, 0.9, "rotate", 3, fillcolor),
SubPolicy(0.9, "shearX", 4, 0.8, "autocontrast", 3, fillcolor),
SubPolicy(0.9, "shearY", 8, 0.4, "invert", 5, fillcolor),
SubPolicy(0.9, "shearY", 5, 0.2, "solarize", 6, fillcolor),
SubPolicy(0.9, "invert", 6, 0.8, "autocontrast", 1, fillcolor),
SubPolicy(0.6, "equalize", 3, 0.9, "rotate", 3, fillcolor),
SubPolicy(0.9, "shearX", 4, 0.3, "solarize", 3, fillcolor),
SubPolicy(0.8, "shearY", 8, 0.7, "invert", 4, fillcolor),
SubPolicy(0.9, "equalize", 5, 0.6, "translateY", 6, fillcolor),
SubPolicy(0.9, "invert", 4, 0.6, "equalize", 7, fillcolor),
SubPolicy(0.3, "contrast", 3, 0.8, "rotate", 4, fillcolor),
SubPolicy(0.8, "invert", 5, 0.0, "translateY", 2, fillcolor),
SubPolicy(0.7, "shearY", 6, 0.4, "solarize", 8, fillcolor),
SubPolicy(0.6, "invert", 4, 0.8, "rotate", 4, fillcolor), SubPolicy(
0.3, "shearY", 7, 0.9, "translateX", 3, fillcolor), SubPolicy(
0.1, "shearX", 6, 0.6, "invert", 5, fillcolor), SubPolicy(
0.7, "solarize", 2, 0.6, "translateY", 7, fillcolor),
SubPolicy(0.8, "shearY", 4, 0.8, "invert", 8, fillcolor), SubPolicy(
0.7, "shearX", 9, 0.8, "translateY", 3, fillcolor), SubPolicy(
0.8, "shearY", 5, 0.7, "autocontrast", 3, fillcolor),
SubPolicy(0.7, "shearX", 2, 0.1, "invert", 5, fillcolor)
]
def __call__(self, img, policy_idx=None):
if policy_idx is None or not isinstance(policy_idx, int):
policy_idx = random.randint(0, len(self.policies) - 1)
else:
policy_idx = policy_idx % len(self.policies)
return self.policies[policy_idx](img)
def __repr__(self):
return "AutoAugment SVHN Policy"
class SubPolicy(object):
def __init__(self,
p1,
operation1,
magnitude_idx1,
p2,
operation2,
magnitude_idx2,
fillcolor=(128, 128, 128)):
ranges = {
"shearX": np.linspace(0, 0.3, 10),
"shearY": np.linspace(0, 0.3, 10),
"translateX": np.linspace(0, 150 / 331, 10),
"translateY": np.linspace(0, 150 / 331, 10),
"rotate": np.linspace(0, 30, 10),
"color": np.linspace(0.0, 0.9, 10),
"posterize": np.round(np.linspace(8, 4, 10), 0).astype(np.int),
"solarize": np.linspace(256, 0, 10),
"contrast": np.linspace(0.0, 0.9, 10),
"sharpness": np.linspace(0.0, 0.9, 10),
"brightness": np.linspace(0.0, 0.9, 10),
"autocontrast": [0] * 10,
"equalize": [0] * 10,
"invert": [0] * 10
}
# from https://stackoverflow.com/questions/5252170/specify-image-filling-color-when-rotating-in-python-with-pil-and-setting-expand
def rotate_with_fill(img, magnitude):
rot = img.convert("RGBA").rotate(magnitude)
return Image.composite(rot,
Image.new("RGBA", rot.size, (128, ) * 4),
rot).convert(img.mode)
func = {
"shearX": lambda img, magnitude: img.transform(
img.size, Image.AFFINE, (1, magnitude * random.choice([-1, 1]), 0, 0, 1, 0),
Image.BICUBIC, fillcolor=fillcolor),
"shearY": lambda img, magnitude: img.transform(
img.size, Image.AFFINE, (1, 0, 0, magnitude * random.choice([-1, 1]), 1, 0),
Image.BICUBIC, fillcolor=fillcolor),
"translateX": lambda img, magnitude: img.transform(
img.size, Image.AFFINE, (1, 0, magnitude * img.size[0] * random.choice([-1, 1]), 0, 1, 0),
fillcolor=fillcolor),
"translateY": lambda img, magnitude: img.transform(
img.size, Image.AFFINE, (1, 0, 0, 0, 1, magnitude * img.size[1] * random.choice([-1, 1])),
fillcolor=fillcolor),
"rotate": lambda img, magnitude: rotate_with_fill(img, magnitude),
# "rotate": lambda img, magnitude: img.rotate(magnitude * random.choice([-1, 1])),
"color": lambda img, magnitude: ImageEnhance.Color(img).enhance(1 + magnitude * random.choice([-1, 1])),
"posterize": lambda img, magnitude: ImageOps.posterize(img, magnitude),
"solarize": lambda img, magnitude: ImageOps.solarize(img, magnitude),
"contrast": lambda img, magnitude: ImageEnhance.Contrast(img).enhance(
1 + magnitude * random.choice([-1, 1])),
"sharpness": lambda img, magnitude: ImageEnhance.Sharpness(img).enhance(
1 + magnitude * random.choice([-1, 1])),
"brightness": lambda img, magnitude: ImageEnhance.Brightness(img).enhance(
1 + magnitude * random.choice([-1, 1])),
"autocontrast": lambda img, magnitude: ImageOps.autocontrast(img),
"equalize": lambda img, magnitude: ImageOps.equalize(img),
"invert": lambda img, magnitude: ImageOps.invert(img)
}
self.p1 = p1
self.operation1 = func[operation1]
self.magnitude1 = ranges[operation1][magnitude_idx1]
self.p2 = p2
self.operation2 = func[operation2]
self.magnitude2 = ranges[operation2][magnitude_idx2]
def __call__(self, img):
if random.random() < self.p1:
img = self.operation1(img, self.magnitude1)
if random.random() < self.p2:
img = self.operation2(img, self.magnitude2)
return img
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import paddle.fluid as fluid
def nccl2_prepare(args, startup_prog, main_prog):
config = fluid.DistributeTranspilerConfig()
config.mode = "nccl2"
t = fluid.DistributeTranspiler(config=config)
envs = args.dist_env
t.transpile(
envs["trainer_id"],
trainers=','.join(envs["trainer_endpoints"]),
current_endpoint=envs["current_endpoint"],
startup_program=startup_prog,
program=main_prog)
def pserver_prepare(args, train_prog, startup_prog):
config = fluid.DistributeTranspilerConfig()
config.slice_var_up = args.split_var
t = fluid.DistributeTranspiler(config=config)
envs = args.dist_env
training_role = envs["training_role"]
t.transpile(
envs["trainer_id"],
program=train_prog,
pservers=envs["pserver_endpoints"],
trainers=envs["num_trainers"],
sync_mode=not args.async_mode,
startup_program=startup_prog)
if training_role == "PSERVER":
pserver_program = t.get_pserver_program(envs["current_endpoint"])
pserver_startup_program = t.get_startup_program(
envs["current_endpoint"],
pserver_program,
startup_program=startup_prog)
return pserver_program, pserver_startup_program
elif training_role == "TRAINER":
train_program = t.get_trainer_program()
return train_program, startup_prog
else:
raise ValueError(
'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
)
def nccl2_prepare_paddle(trainer_id, startup_prog, main_prog):
config = fluid.DistributeTranspilerConfig()
config.mode = "nccl2"
t = fluid.DistributeTranspiler(config=config)
t.transpile(
trainer_id,
trainers=os.environ.get('PADDLE_TRAINER_ENDPOINTS'),
current_endpoint=os.environ.get('PADDLE_CURRENT_ENDPOINT'),
startup_program=startup_prog,
program=main_prog)
def prepare_for_multi_process(exe, build_strategy, train_prog):
# prepare for multi-process
trainer_id = int(os.environ.get('PADDLE_TRAINER_ID', 0))
num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
if num_trainers < 2: return
print("PADDLE_TRAINERS_NUM", num_trainers)
print("PADDLE_TRAINER_ID", trainer_id)
build_strategy.num_trainers = num_trainers
build_strategy.trainer_id = trainer_id
# NOTE(zcd): use multi processes to train the model,
# and each process use one GPU card.
startup_prog = fluid.Program()
nccl2_prepare_paddle(trainer_id, startup_prog, train_prog)
# the startup_prog are run two times, but it doesn't matter.
exe.run(startup_prog)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle.fluid as fluid
import paddle.fluid.layers.ops as ops
from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
class CosineDecayWarmup(LearningRateDecay):
def __init__(self,
learning_rate,
step_each_epoch,
epochs,
warmup_epochs,
begin=0,
step=1,
dtype='float32'):
super(CosineDecayWarmup, self).__init__(begin, step, dtype)
self.learning_rate = learning_rate
self.step_each_epoch = step_each_epoch
self.epochs = epochs
self.warmup_epochs = warmup_epochs
def step(self):
if (self.step_num / self.step_each_epoch) < self.warmup_epochs:
decayed_lr = self.learning_rate * (self.step_num / (
self.step_each_epoch * self.warmup_epochs))
else:
decayed_lr = self.learning_rate * 0.5 * (fluid.layers.cos(
self.create_lr_var((self.step_num - self.warmup_epochs *
self.step_each_epoch) * math.pi /
(self.epochs * self.step_each_epoch))) + 1)
return decayed_lr
class Optimizer(object):
"""A class used to represent several optimizer methods
Attributes:
batch_size: batch size on all devices.
lr: learning rate.
lr_strategy: learning rate decay strategy.
l2_decay: l2_decay parameter.
momentum_rate: momentum rate when using Momentum optimizer.
step_epochs: piecewise decay steps.
num_epochs: number of total epochs.
total_images: total images.
step: total steps in the an epoch.
"""
def __init__(self, args, parameter_list):
self.parameter_list = parameter_list
self.batch_size = args.batch_size
self.lr = args.lr
self.lr_strategy = args.lr_strategy
self.l2_decay = args.l2_decay
self.momentum_rate = args.momentum_rate
self.step_epochs = args.step_epochs
self.num_epochs = args.num_epochs
self.warm_up_epochs = args.warm_up_epochs
self.decay_epochs = args.decay_epochs
self.decay_rate = args.decay_rate
self.total_images = args.total_images
self.step = int(math.ceil(float(self.total_images) / self.batch_size))
def piecewise_decay(self):
"""piecewise decay with Momentum optimizer
Returns:
a piecewise_decay optimizer
"""
bd = [self.step * e for e in self.step_epochs]
lr = [self.lr * (0.1**i) for i in range(len(bd) + 1)]
learning_rate = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
optimizer = fluid.optimizer.Momentum(
learning_rate=learning_rate,
momentum=self.momentum_rate,
regularization=fluid.regularizer.L2Decay(self.l2_decay),
parameter_list=self.parameter_list)
return optimizer
def cosine_decay(self):
"""cosine decay with Momentum optimizer
Returns:
a cosine_decay optimizer
"""
learning_rate = fluid.layers.cosine_decay(
learning_rate=self.lr,
step_each_epoch=self.step,
epochs=self.num_epochs)
optimizer = fluid.optimizer.Momentum(
learning_rate=learning_rate,
momentum=self.momentum_rate,
regularization=fluid.regularizer.L2Decay(self.l2_decay),
parameter_list=self.parameter_list)
return optimizer
def cosine_decay_warmup(self):
"""cosine decay with warmup
Returns:
a cosine_decay_with_warmup optimizer
"""
learning_rate = CosineDecayWarmup(
learning_rate=self.lr,
step_each_epoch=self.step,
epochs=self.num_epochs,
warmup_epochs=5)
optimizer = fluid.optimizer.Momentum(
learning_rate=learning_rate,
momentum=self.momentum_rate,
regularization=fluid.regularizer.L2Decay(self.l2_decay),
parameter_list=self.parameter_list)
return optimizer
def linear_decay(self):
"""linear decay with Momentum optimizer
Returns:
a linear_decay optimizer
"""
end_lr = 0
learning_rate = fluid.layers.polynomial_decay(
self.lr, self.step, end_lr, power=1)
optimizer = fluid.optimizer.Momentum(
learning_rate=learning_rate,
momentum=self.momentum_rate,
regularization=fluid.regularizer.L2Decay(self.l2_decay),
parameter_list=self.parameter_list)
return optimizer
def adam_decay(self):
"""Adam optimizer
Returns:
an adam_decay optimizer
"""
return fluid.optimizer.Adam(
learning_rate=self.lr, parameter_list=self.parameter_list)
def cosine_decay_RMSProp(self):
"""cosine decay with RMSProp optimizer
Returns:
an cosine_decay_RMSProp optimizer
"""
learning_rate = fluid.layers.cosine_decay(
learning_rate=self.lr,
step_each_epoch=self.step,
epochs=self.num_epochs)
optimizer = fluid.optimizer.RMSProp(
learning_rate=learning_rate,
momentum=self.momentum_rate,
regularization=fluid.regularizer.L2Decay(self.l2_decay),
# Apply epsilon=1 on ImageNet dataset.
epsilon=1,
parameter_list=self.parameter_list)
return optimizer
def default_decay(self):
"""default decay
Returns:
default decay optimizer
"""
optimizer = fluid.optimizer.Momentum(
learning_rate=self.lr,
momentum=self.momentum_rate,
regularization=fluid.regularizer.L2Decay(self.l2_decay),
parameter_list=self.parameter_list)
return optimizer
def create_optimizer(args, parameter_list):
Opt = Optimizer(args, parameter_list)
optimizer = getattr(Opt, args.lr_strategy)()
return optimizer
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import distutils.util
import numpy as np
import six
import argparse
import functools
import logging
import sys
import os
import warnings
import signal
import paddle
import paddle.fluid as fluid
from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
from paddle.fluid.framework import Program, program_guard, name_scope, default_main_program
from paddle.fluid import unique_name, layers
from utils import dist_utils
def print_arguments(args):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
print("------------- Configuration Arguments -------------")
for arg, value in sorted(six.iteritems(vars(args))):
print("%25s : %s" % (arg, value))
print("----------------------------------------------------")
def add_arguments(argname, type, default, help, argparser, **kwargs):
"""Add argparse's argument.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
add_argument("name", str, "Jonh", "User name.", parser)
args = parser.parse_args()
"""
type = distutils.util.strtobool if type == bool else type
argparser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
def parse_args():
"""Add arguments
Returns:
all training args
"""
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('use_data_parallel', bool, False, "The flag indicating whether to use data parallel mode to train the model.")
add_arg('ce', bool, False, "run ce.")
# ENV
add_arg('use_gpu', bool, True, "Whether to use GPU.")
add_arg('model_save_dir', str, "./output", "The directory path to save model.")
add_arg('data_dir', str, "./data/ILSVRC2012/", "The ImageNet dataset root directory.")
#add_arg('data_dir', str, "../../PaddleCV/image_classification/data/", "The ImageNet dataset root directory.")
add_arg('pretrained_model', str, None, "Whether to load pretrained model.")
add_arg('checkpoint', str, './output.ofa.mul/_ofa_epoch60', "Whether to resume checkpoint.")
add_arg('print_step', int, 10, "The steps interval to print logs")
add_arg('save_step', int, 1, "The steps interval to save checkpoints")
# SOLVER AND HYPERPARAMETERS
add_arg('model', str, "once_for_all_kernel", "The name of network.")
add_arg('total_images', int, 1281167, "The number of total training images.")
add_arg('num_epochs', int, 360, "The number of total epochs.")
add_arg('class_dim', int, 1000, "The number of total classes.")
add_arg('image_shape', str, "3,224,224", "The size of Input image, order: [channels, height, weidth] ")
add_arg('batch_size', int, 512, "Minibatch size on a device.")
add_arg('test_batch_size', int, 512, "Test batch size on a deveice.")
add_arg('lr', float, 0.4, "The learning rate.")
add_arg('lr_strategy', str, "cosine_decay", "The learning rate decay strategy.")
add_arg('l2_decay', float, 1e-4, "The l2_decay parameter.")
add_arg('momentum_rate', float, 0.9, "The value of momentum_rate.")
add_arg('warm_up_epochs', float, 5.0, "The value of warm up epochs")
add_arg('decay_epochs', float, 2.4, "Decay epochs of exponential decay learning rate scheduler")
add_arg('decay_rate', float, 0.97, "Decay rate of exponential decay learning rate scheduler")
add_arg('drop_connect_rate', float, 0.2, "The value of drop connect rate")
parser.add_argument('--step_epochs', nargs='+', type=int, default=[360], help="piecewise decay step")
# NOTE: used for benchmark
add_arg('max_iter', int, 0, "The number of total train max_iters.")
# READER AND PREPROCESS
add_arg('lower_scale', float, 0.08, "The value of lower_scale in ramdom_crop")
add_arg('lower_ratio', float, 3./4., "The value of lower_ratio in ramdom_crop")
add_arg('upper_ratio', float, 4./3., "The value of upper_ratio in ramdom_crop")
add_arg('resize_short_size', int, 256, "The value of resize_short_size")
add_arg('crop_size', int, 224, "The value of crop size")
add_arg('use_mixup', bool, False, "Whether to use mixup")
add_arg('mixup_alpha', float, 0.2, "The value of mixup_alpha")
add_arg('reader_thread', int, 8, "The number of multi thread reader")
add_arg('reader_buf_size', int, 16, "The buf size of multi thread reader")
add_arg('interpolation', int, None, "The interpolation mode")
add_arg('use_aa', bool, False, "Whether to use auto augment")
parser.add_argument('--image_mean', nargs='+', type=float, default=[0.485, 0.456, 0.406], help="The mean of input image data")
parser.add_argument('--image_std', nargs='+', type=float, default=[0.229, 0.224, 0.225], help="The std of input image data")
# SWITCH
#NOTE: (2019/08/08) FP16 is moving to PaddlePaddle/Fleet now
#add_arg('use_fp16', bool, False, "Whether to enable half precision training with fp16." )
#add_arg('scale_loss', float, 1.0, "The value of scale_loss for fp16." )
add_arg('use_label_smoothing', bool, False, "Whether to use label_smoothing")
add_arg('label_smoothing_epsilon', float, 0.1, "The value of label_smoothing_epsilon parameter")
#NOTE: (2019/08/08) temporary disable use_distill
#add_arg('use_distill', bool, False, "Whether to use distill")
add_arg('random_seed', int, None, "random seed")
add_arg('use_ema', bool, False, "Whether to use ExponentialMovingAverage.")
add_arg('ema_decay', float, 0.9999, "The value of ema decay rate")
add_arg('padding_type', str, "SAME", "Padding type of convolution")
add_arg('use_se', bool, True, "Whether to use Squeeze-and-Excitation module for EfficientNet.")
# yapf: enable
args = parser.parse_args()
return args
def check_gpu():
"""
Log error and exit when set use_gpu=true in paddlepaddle
cpu ver sion.
"""
logger = logging.getLogger(__name__)
err = "Config use_gpu cannot be set as true while you are " \
"using paddlepaddle cpu version ! \nPlease try: \n" \
"\t1. Install paddlepaddle-gpu to run model on GPU \n" \
"\t2. Set use_gpu as false in config file to run " \
"model on CPU"
try:
if args.use_gpu and not fluid.is_compiled_with_cuda():
print(err)
sys.exit(1)
except Exception as e:
pass
def check_version():
"""
Log error and exit when the installed version of paddlepaddle is
not satisfied.
"""
err = "PaddlePaddle version 1.6 or higher is required, " \
"or a suitable develop version is satisfied as well. \n" \
"Please make sure the version is good with your code." \
try:
fluid.require_version('1.6.0')
except Exception as e:
print(err)
sys.exit(1)
def check_args(args):
"""check arguments before running
Args:
all arguments
"""
# check models name
sys.path.append("..")
import models
model_list = [m for m in dir(models) if "__" not in m]
assert args.model in model_list, "{} is not in lists: {}, please check the model name".format(
args.model, model_list)
# check learning rate strategy
lr_strategy_list = [
"piecewise_decay", "cosine_decay", "linear_decay",
"cosine_decay_warmup", "exponential_decay_warmup"
]
if args.lr_strategy not in lr_strategy_list:
warnings.warn(
"\n{} is not in lists: {}, \nUse default learning strategy now.".
format(args.lr_strategy, lr_strategy_list))
args.lr_strategy = "default_decay"
# check confict of GoogLeNet and mixup
if args.model == "GoogLeNet":
assert args.use_mixup == False, "Cannot use mixup processing in GoogLeNet, please set use_mixup = False."
if args.interpolation:
assert args.interpolation in [
0, 1, 2, 3, 4
], "Wrong interpolation, please set:\n0: cv2.INTER_NEAREST\n1: cv2.INTER_LINEAR\n2: cv2.INTER_CUBIC\n3: cv2.INTER_AREA\n4: cv2.INTER_LANCZOS4"
if args.padding_type:
assert args.padding_type in [
"SAME", "VALID", "DYNAMIC"
], "Wrong padding_type, please set:\nSAME\nVALID\nDYNAMIC"
assert args.checkpoint is None or args.pretrained_model is None, "Do not init model by checkpoint and pretrained_model both."
# check pretrained_model path for loading
if args.pretrained_model is not None:
assert isinstance(args.pretrained_model, str)
assert os.path.isdir(
args.
pretrained_model), "please support available pretrained_model path."
#FIXME: check checkpoint path for saving
if args.checkpoint is not None:
assert isinstance(args.checkpoint, str)
assert os.path.isdir(
args.checkpoint
), "please support available checkpoint path for initing model."
# check params for loading
"""
if args.save_params:
assert isinstance(args.save_params, str)
assert os.path.isdir(
args.save_params), "please support available save_params path."
"""
# check gpu: when using gpu, the number of visible cards should divide batch size
if args.use_gpu:
assert args.batch_size % fluid.core.get_cuda_device_count(
) == 0, "please support correct batch_size({}), which can be divided by available cards({}), you can change the number of cards by indicating: export CUDA_VISIBLE_DEVICES= ".format(
args.batch_size, fluid.core.get_cuda_device_count())
# check data directory
assert os.path.isdir(
args.data_dir
), "Data doesn't exist in {}, please load right path".format(args.data_dir)
#check gpu
check_gpu()
check_version()
def init_model(exe, args, program):
if args.checkpoint:
fluid.io.load_persistables(exe, args.checkpoint, main_program=program)
print("Finish initing model from %s" % (args.checkpoint))
if args.pretrained_model:
def if_exist(var):
return os.path.exists(os.path.join(args.pretrained_model, var.name))
fluid.io.load_vars(
exe,
args.pretrained_model,
main_program=program,
predicate=if_exist)
def save_model(args, exe, train_prog, info):
model_path = os.path.join(args.model_save_dir, args.model, str(info))
if not os.path.isdir(model_path):
os.makedirs(model_path)
fluid.io.save_persistables(exe, model_path, main_program=train_prog)
print("Already save model in %s" % (model_path))
def create_data_loader(is_train, args):
"""create data_loader
Usage:
Using mixup process in training, it will return 5 results, include data_loader, image, y_a(label), y_b(label) and lamda, or it will return 3 results, include data_loader, image, and label.
Args:
is_train: mode
args: arguments
Returns:
data_loader and the input data of net,
"""
image_shape = [int(m) for m in args.image_shape.split(",")]
feed_image = fluid.data(
name="feed_image",
shape=[None] + image_shape,
dtype="float32",
lod_level=0)
feed_label = fluid.data(
name="feed_label", shape=[None, 1], dtype="int64", lod_level=0)
feed_y_a = fluid.data(
name="feed_y_a", shape=[None, 1], dtype="int64", lod_level=0)
if is_train and args.use_mixup:
feed_y_b = fluid.data(
name="feed_y_b", shape=[None, 1], dtype="int64", lod_level=0)
feed_lam = fluid.data(
name="feed_lam", shape=[None, 1], dtype="float32", lod_level=0)
data_loader = fluid.io.DataLoader.from_generator(
capacity=64,
use_double_buffer=True,
iterable=True,
return_list=True)
return data_loader, [feed_image, feed_y_a, feed_y_b, feed_lam]
else:
data_loader = fluid.io.DataLoader.from_generator(
capacity=64,
use_double_buffer=True,
iterable=True,
return_list=True)
return data_loader, [feed_image, feed_label]
def print_info(pass_id, batch_id, print_step, metrics, time_info, info_mode):
"""print function
Args:
pass_id: epoch index
batch_id: batch index
print_step: the print_step arguments
metrics: message to print
time_info: time infomation
info_mode: mode
"""
if info_mode == "batch":
if batch_id % print_step == 0:
#if isinstance(metrics,np.ndarray):
# train and mixup output
if len(metrics) == 2:
loss, lr = metrics
print(
"[Pass {0}, train batch {1}] \tloss {2}, lr {3}, elapse {4}".
format(pass_id, batch_id, "%.5f" % loss, "%.5f" % lr,
"%2.4f sec" % time_info))
# train and no mixup output
elif len(metrics) == 4:
loss, acc1, acc5, lr = metrics
print(
"[Pass {0}, train batch {1}] \tloss {2}, acc1 {3}, acc5 {4}, lr {5}, elapse {6}".
format(pass_id, batch_id, "%.5f" % loss, "%.5f" % acc1,
"%.5f" % acc5, "%.5f" % lr, "%2.4f sec" % time_info))
# test output
elif len(metrics) == 3:
loss, acc1, acc5 = metrics
print(
"[Pass {0}, test batch {1}] \tloss {2}, acc1 {3}, acc5 {4}, elapse {5}".
format(pass_id, batch_id, "%.5f" % loss, "%.5f" % acc1,
"%.5f" % acc5, "%2.4f sec" % time_info))
else:
raise Exception(
"length of metrics {} is not implemented, It maybe caused by wrong format of build_program_output".
format(len(metrics)))
sys.stdout.flush()
elif info_mode == "epoch":
## TODO add time elapse
#if isinstance(metrics,np.ndarray):
if len(metrics) == 5:
train_loss, _, test_loss, test_acc1, test_acc5 = metrics
print(
"[End pass {0}]\ttrain_loss {1}, test_loss {2}, test_acc1 {3}, test_acc5 {4}".
format(pass_id, "%.5f" % train_loss, "%.5f" % test_loss, "%.5f"
% test_acc1, "%.5f" % test_acc5))
elif len(metrics) == 7:
train_loss, train_acc1, train_acc5, _, test_loss, test_acc1, test_acc5 = metrics
print(
"[End pass {0}]\ttrain_loss {1}, train_acc1 {2}, train_acc5 {3},test_loss {4}, test_acc1 {5}, test_acc5 {6}".
format(pass_id, "%.5f" % train_loss, "%.5f" % train_acc1, "%.5f"
% train_acc5, "%.5f" % test_loss, "%.5f" % test_acc1,
"%.5f" % test_acc5))
sys.stdout.flush()
elif info_mode == "ce":
raise Warning("CE code is not ready")
else:
raise Exception("Illegal info_mode")
def best_strategy_compiled(args, program, loss, exe):
"""make a program which wrapped by a compiled program
"""
if os.getenv('FLAGS_use_ngraph'):
return program
else:
build_strategy = fluid.compiler.BuildStrategy()
#Feature will be supported in Fluid v1.6
#build_strategy.enable_inplace = True
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_threads = fluid.core.get_cuda_device_count()
exec_strategy.num_iteration_per_drop_scope = 10
num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
if num_trainers > 1 and args.use_gpu:
dist_utils.prepare_for_multi_process(exe, build_strategy, program)
# NOTE: the process is fast when num_threads is 1
# for multi-process training.
exec_strategy.num_threads = 1
compiled_program = fluid.CompiledProgram(program).with_data_parallel(
loss_name=loss.name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
return compiled_program
class ExponentialMovingAverage(object):
def __init__(self,
decay=0.999,
thres_steps=None,
zero_debias=False,
name=None):
self._decay = decay
self._thres_steps = thres_steps
self._name = name if name is not None else ''
self._decay_var = self._get_ema_decay()
self._params_tmps = []
for param in default_main_program().global_block().all_parameters():
if param.do_model_average != False:
tmp = param.block.create_var(
name=unique_name.generate(".".join(
[self._name + param.name, 'ema_tmp'])),
dtype=param.dtype,
persistable=False,
stop_gradient=True)
self._params_tmps.append((param, tmp))
self._ema_vars = {}
for param, tmp in self._params_tmps:
with param.block.program._optimized_guard(
[param, tmp]), name_scope('moving_average'):
self._ema_vars[param.name] = self._create_ema_vars(param)
self.apply_program = Program()
block = self.apply_program.global_block()
with program_guard(main_program=self.apply_program):
decay_pow = self._get_decay_pow(block)
for param, tmp in self._params_tmps:
param = block._clone_variable(param)
tmp = block._clone_variable(tmp)
ema = block._clone_variable(self._ema_vars[param.name])
layers.assign(input=param, output=tmp)
# bias correction
if zero_debias:
ema = ema / (1.0 - decay_pow)
layers.assign(input=ema, output=param)
self.restore_program = Program()
block = self.restore_program.global_block()
with program_guard(main_program=self.restore_program):
for param, tmp in self._params_tmps:
tmp = block._clone_variable(tmp)
param = block._clone_variable(param)
layers.assign(input=tmp, output=param)
def _get_ema_decay(self):
with default_main_program()._lr_schedule_guard():
decay_var = layers.tensor.create_global_var(
shape=[1],
value=self._decay,
dtype='float32',
persistable=True,
name="scheduled_ema_decay_rate")
if self._thres_steps is not None:
decay_t = (self._thres_steps + 1.0) / (self._thres_steps + 10.0)
with layers.control_flow.Switch() as switch:
with switch.case(decay_t < self._decay):
layers.tensor.assign(decay_t, decay_var)
with switch.default():
layers.tensor.assign(
np.array(
[self._decay], dtype=np.float32),
decay_var)
return decay_var
def _get_decay_pow(self, block):
global_steps = layers.learning_rate_scheduler._decay_step_counter()
decay_var = block._clone_variable(self._decay_var)
decay_pow_acc = layers.elementwise_pow(decay_var, global_steps + 1)
return decay_pow_acc
def _create_ema_vars(self, param):
param_ema = layers.create_global_var(
name=unique_name.generate(self._name + param.name + '_ema'),
shape=param.shape,
value=0.0,
dtype=param.dtype,
persistable=True)
return param_ema
def update(self):
"""
Update Exponential Moving Average. Should only call this method in
train program.
"""
param_master_emas = []
for param, tmp in self._params_tmps:
with param.block.program._optimized_guard(
[param, tmp]), name_scope('moving_average'):
param_ema = self._ema_vars[param.name]
if param.name + '.master' in self._ema_vars:
master_ema = self._ema_vars[param.name + '.master']
param_master_emas.append([param_ema, master_ema])
else:
ema_t = param_ema * self._decay_var + param * (
1 - self._decay_var)
layers.assign(input=ema_t, output=param_ema)
# for fp16 params
for param_ema, master_ema in param_master_emas:
default_main_program().global_block().append_op(
type="cast",
inputs={"X": master_ema},
outputs={"Out": param_ema},
attrs={
"in_dtype": master_ema.dtype,
"out_dtype": param_ema.dtype
})
@signature_safe_contextmanager
def apply(self, executor, need_restore=True):
"""
Apply moving average to parameters for evaluation.
Args:
executor (Executor): The Executor to execute applying.
need_restore (bool): Whether to restore parameters after applying.
"""
executor.run(self.apply_program)
try:
yield
finally:
if need_restore:
self.restore(executor)
def restore(self, executor):
"""Restore parameters.
Args:
executor (Executor): The Executor to execute restoring.
"""
executor.run(self.restore_program)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册