add ofa demo

e830dfa9 · xiteng1988 · a596e2b6 · e830dfa9 · e830dfa9 · e830dfa9
14 changed file
--- a/demo/once_for_all/dy_models/__init__.py
+++ b/demo/once_for_all/dy_models/__init__.py
+from .once_for_all_kernel import OFA_kernel
+from .mobilenet_v1 import MobileNetV1
+from .mobilenet_v2 import MobileNetV2
+from .mobilenet_v3 import MobileNetV3_dy
--- a/demo/once_for_all/dy_models/func_conv.py
+++ b/demo/once_for_all/dy_models/func_conv.py
+#   Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+__all__ = ['conv2d', 'conv2d_transpose', 'conv3d', 'conv3d_transpose']
+
+import numpy as np
+from paddle.fluid.framework import Variable, in_dygraph_mode
+from paddle.fluid import core, dygraph_utils
+from paddle.fluid.layers import nn, utils
+from paddle.fluid.data_feeder import check_variable_and_dtype
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.layer_helper import LayerHelper
+
+
+def _is_list_or_tuple(input):
+    """Tell whether ``input`` is a ``list`` or ``tuple`` (subclasses count)."""
+    for sequence_type in (list, tuple):
+        if isinstance(input, sequence_type):
+            return True
+    return False
+
+
+def _zero_padding_in_batch_and_channel(padding, channel_last):
+    """Check that the batch and channel entries of a per-dimension padding are zero.
+
+    Entry 0 of ``padding`` is treated as the batch dimension. The channel
+    entry is the last one when ``channel_last`` is true and entry 1 otherwise.
+    Both entries must equal ``[0, 0]`` once converted to a list.
+    """
+    channel_index = -1 if channel_last else 1
+    no_pad = [0, 0]
+    batch_is_zero = list(padding[0]) == no_pad
+    return batch_is_zero and list(padding[channel_index]) == no_pad
+
+
+def _exclude_padding_in_batch_and_channel(padding, channel_last):
+    """Drop the batch/channel entries and flatten the remaining padding pairs.
+
+    With ``channel_last`` the first and last entries are removed; otherwise
+    the first two are. The surviving per-dimension entries are concatenated,
+    in order, into one flat list.
+    """
+    spatial_pairs = padding[1:-1] if channel_last else padding[2:]
+    flattened = []
+    for pair in spatial_pairs:
+        flattened.extend(pair)
+    return flattened
+
+
+def _update_padding_nd(padding, channel_last, num_dims):
+    """Normalize a user-supplied ``padding`` argument for an N-d convolution.
+
+    Args:
+        padding (str|int|list|tuple): One of
+
+            * ``"SAME"`` or ``"VALID"`` (case-insensitive);
+            * a per-dimension list ``[[before, after], ...]`` of length
+              ``num_dims + 2`` that also covers the batch and channel
+              dimensions, both of which must be ``[0, 0]``;
+            * a flat list ``[before, after, before, after, ...]`` of length
+              ``2 * num_dims``;
+            * a flat list ``[pad_d1, pad_d2, ...]`` of length ``num_dims``;
+            * any other value, which is handed to ``utils.convert_to_list``.
+        channel_last (bool): Whether the channel dimension is the last one.
+            Only used to locate the batch/channel entries of the nested form.
+        num_dims (int): Number of spatial dimensions.
+
+    Returns:
+        tuple: ``(padding, padding_algorithm)``, where ``padding_algorithm``
+        is ``"SAME"``, ``"VALID"`` or ``"EXPLICIT"``. For the string forms the
+        returned padding is ``[0] * num_dims``.
+
+    Raises:
+        ValueError: If ``padding`` is a string other than "SAME"/"VALID", if
+            the nested form has non-zero batch or channel padding, or if a
+            list/tuple matches none of the supported layouts.
+    """
+    if isinstance(padding, str):
+        padding_algorithm = padding.upper()
+        if padding_algorithm not in ("SAME", "VALID"):
+            raise ValueError(
+                "Unknown padding: '{}'. It can only be 'SAME' or 'VALID'.".
+                format(padding_algorithm))
+        # Both algorithms carry an all-zero explicit padding.
+        return [0] * num_dims, padding_algorithm
+
+    if not isinstance(padding, (list, tuple)):
+        # for integer padding
+        return utils.convert_to_list(padding, num_dims, 'padding'), "EXPLICIT"
+
+    # for padding like
+    # [(pad_before, pad_after), (pad_before, pad_after), ...]
+    # padding for batch_dim and channel_dim included
+    if len(padding) == 2 + num_dims and isinstance(padding[0], (list, tuple)):
+        if not _zero_padding_in_batch_and_channel(padding, channel_last):
+            raise ValueError(
+                "Non-zero padding({}) in the batch or channel dimensions "
+                "is not supported.".format(padding))
+        padding = _exclude_padding_in_batch_and_channel(padding, channel_last)
+        # Symmetric padding collapses to one value per dimension.
+        if utils._is_symmetric_padding(padding, num_dims):
+            padding = padding[0::2]
+    # for padding like [pad_before, pad_after, pad_before, pad_after, ...]
+    elif len(padding) == 2 * num_dims and isinstance(padding[0], int):
+        padding = utils.convert_to_list(padding, 2 * num_dims, 'padding')
+        if utils._is_symmetric_padding(padding, num_dims):
+            padding = padding[0::2]
+    # for padding like [pad_d1, pad_d2, ...]
+    elif len(padding) == num_dims and isinstance(padding[0], int):
+        padding = utils.convert_to_list(padding, num_dims, 'padding')
+    else:
+        raise ValueError("Invalid padding: {}".format(padding))
+    return padding, "EXPLICIT"
+
+
+def conv2d(input,
+           weight,
+           bias=None,
+           padding=0,
+           stride=1,
+           dilation=1,
+           groups=1,
+           use_cudnn=True,
+           act=None,
+           data_format="NCHW",
+           name=None):
+    """
+    :alias_main: paddle.nn.functional.conv2d
+    :alias: paddle.nn.functional.conv2d,paddle.nn.functional.conv.conv2d
+
+    Apply a 2-D convolution to ``input`` with an explicitly supplied filter.
+
+    The output is calculated from the input, the filter and the strides,
+    paddings, dilations and groups parameters. Input and output are in NCHW
+    or NHWC format, where N is batch size, C is the number of channels, H is
+    the height of the feature, and W is the width of the feature. The filter
+    is in MCHW format, where M is the number of output image channels, C is
+    the number of input image channels, H is the height of the filter, and W
+    is the width of the filter. If groups is greater than 1, C equals the
+    number of input image channels divided by groups.
+    Please refer to UFLDL's `convolution
+    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_
+    for more details.
+
+    If a bias and an activation type are provided, the bias is added to the
+    output of the convolution, and the corresponding activation function is
+    applied to the final result.
+
+    For each input :math:`X`, the equation is:
+
+    .. math::
+
+        Out = \\sigma (W \\ast X + b)
+
+    Where:
+
+    * :math:`X`: Input value, a tensor with NCHW or NHWC format.
+    * :math:`W`: Filter value, a tensor with MCHW format.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+
+    Example:
+
+        - Input:
+
+          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
+
+          Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
+
+        - Output:
+
+          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
+
+        Where
+
+        .. math::
+
+            H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
+            W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+
+    Args:
+        input (Variable): The input is 4-D Tensor with shape [N, C, H, W], the data type
+            of input is float16 or float32 or float64.
+        weight (Variable): The convolution kernel with shape [M, C/g, kH, kW], where M is
+            the number of output channels, g is the number of groups, kH is the filter's
+            height, kW is the filter's width.
+        bias (Variable, optional): The bias with shape [M,].
+        padding (string|int|list|tuple): The padding size. It means the number of zero-paddings
+            on both sides for each dimension. If `padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If padding size is a tuple or list,
+            it could be in three forms: `[pad_height, pad_width]` or
+            `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when
+            `data_format` is `"NCHW"`, `padding` can be in the form `[[0,0], [0,0],
+            [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
+            when `data_format` is `"NHWC"`, `padding` can be in the form
+            `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
+            Default: padding = 0.
+        stride (int|tuple): The stride size. It means the stride in convolution.
+            If stride is a tuple, it must contain two integers, (stride_height, stride_width).
+            Otherwise, stride_height = stride_width = stride. Default: stride = 1.
+        dilation (int|tuple): The dilation size. It means the spacing between the kernel
+            points. If dilation is a tuple, it must contain two integers, (dilation_height,
+            dilation_width). Otherwise, dilation_height = dilation_width = dilation.
+            Default: dilation = 1.
+        groups (int): The groups number of the Conv2d Layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: groups=1.
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        act (str): Activation type, if it is set to None, activation is not appended.
+            Default: None
+        data_format (str, optional): Specify the data format of the input, and the data format of the output
+            will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
+            The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+            `[batch_size, input_channels, input_height, input_width]`.
+        name(str, optional): For detailed information, please refer
+           to :ref:`api_guide_Name`. Usually name is no need to set and
+           None by default.
+
+    Returns:
+        A Variable holding Tensor representing the conv2d, whose data type is the
+        same with input. If act is None, the tensor variable storing the convolution
+        result, and if act is not None, the tensor variable storing convolution
+        and non-linearity activation result.
+
+    Raises:
+        ValueError: If the type of `use_cudnn` is not bool.
+        ValueError: If `data_format` is not "NCHW" or "NHWC".
+        ValueError: If the channel dimension of the input is negative (i.e. not defined).
+        ValueError: If the number of input channels is not divisible by `groups`.
+        ValueError: If the number of filters is not divisible by `groups`.
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0
+            or the element corresponding to the input's channel is not 0.
+        ShapeError: Remaining shape mismatches (input not 4-D, input/filter rank
+            mismatch, channel count not matching the filter) are not checked in
+            this function; they are presumably reported by the underlying
+            operator -- TODO confirm.
+
+    Examples:
+        .. code-block:: python
+
+          from paddle import fluid
+          import paddle.nn.functional as F
+          import paddle.fluid.dygraph as dg
+          import numpy as np
+          x = np.random.randn(2, 3, 8, 8).astype(np.float32)
+          w = np.random.randn(6, 3, 3, 3).astype(np.float32)
+          place = fluid.CPUPlace()
+          with dg.guard(place):
+              x_var = dg.to_variable(x)
+              w_var = dg.to_variable(w)
+              y_var = F.conv2d(x_var, w_var, act="relu")
+              y_np = y_var.numpy()
+          print(y_np.shape)
+          # (2, 6, 6, 6)
+    """
+    # entry checks
+    if not isinstance(use_cudnn, bool):
+        raise ValueError("Attr(use_cudnn) should be True or False. "
+                         "Received Attr(use_cudnn): {}.".format(use_cudnn))
+    if data_format not in ["NCHW", "NHWC"]:
+        raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'. "
+                         "Received Attr(data_format): {}.".format(data_format))
+
+    channel_last = (data_format == "NHWC")
+    channel_dim = -1 if channel_last else 1
+    num_channels = input.shape[channel_dim]
+    num_filters = weight.shape[0]
+    # A negative channel size means the dimension is not known yet.
+    if num_channels < 0:
+        raise ValueError("The channel dimmention of the input({}) "
+                         "should be defined. Received: {}.".format(
+                             input.shape, num_channels))
+    if num_channels % groups != 0:
+        raise ValueError(
+            "the channel of input must be divisible by groups,"
+            "received: the channel of input is {}, the shape of input is {}"
+            ", the groups is {}".format(num_channels, input.shape, groups))
+    if num_filters % groups != 0:
+        raise ValueError(
+            "the number of filters must be divisible by groups,"
+            "received: the number of filters is {}, the shape of weight is {}"
+            ", the groups is {}".format(num_filters, weight.shape, groups))
+
+    # update attrs
+    padding, padding_algorithm = _update_padding_nd(padding, channel_last, 2)
+    stride = utils.convert_to_list(stride, 2, 'stride')
+    dilation = utils.convert_to_list(dilation, 2, 'dilation')
+
+    # The depthwise op is selected only when cuDNN is not requested and every
+    # input channel forms its own group.
+    l_type = "conv2d"
+    if (num_channels == groups and num_filters % num_channels == 0 and
+            not use_cudnn):
+        l_type = 'depthwise_conv2d'
+
+    if in_dygraph_mode():
+        # Dygraph ops take their attributes as a flat (name, value, ...) tuple.
+        attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation,
+                 'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False,
+                 'fuse_relu_before_depthwise_conv', False, "padding_algorithm",
+                 padding_algorithm, "data_format", data_format)
+        pre_bias = getattr(core.ops, l_type)(input, weight, *attrs)
+        if bias is not None:
+            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+        else:
+            pre_act = pre_bias
+        out = dygraph_utils._append_activation_in_dygraph(
+            pre_act, act, use_cudnn=use_cudnn)
+    else:
+        inputs = {'Input': [input], 'Filter': [weight]}
+        attrs = {
+            'strides': stride,
+            'paddings': padding,
+            'dilations': dilation,
+            'groups': groups,
+            'use_cudnn': use_cudnn,
+            'use_mkldnn': False,
+            'fuse_relu_before_depthwise_conv': False,
+            "padding_algorithm": padding_algorithm,
+            "data_format": data_format
+        }
+        check_variable_and_dtype(input, 'input',
+                                 ['float16', 'float32', 'float64'], 'conv2d')
+        # NOTE: every local defined so far is forwarded to LayerHelper through
+        # **locals(), so the parameter and local names above must stay as-is.
+        helper = LayerHelper(l_type, **locals())
+        dtype = helper.input_dtype()
+        pre_bias = helper.create_variable_for_type_inference(dtype)
+        outputs = {"Output": [pre_bias]}
+        helper.append_op(
+            type=l_type, inputs=inputs, outputs=outputs, attrs=attrs)
+        if bias is not None:
+            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+        else:
+            pre_act = pre_bias
+        out = helper.append_activation(pre_act)
+    return out
+
+
+def conv2d_transpose(input,
+                     weight,
+                     bias=None,
+                     output_size=None,
+                     padding=0,
+                     stride=1,
+                     dilation=1,
+                     groups=1,
+                     use_cudnn=True,
+                     act=None,
+                     data_format='NCHW',
+                     name=None):
+    """
+    :alias_main: paddle.nn.functional.conv2d_transpose
+    :alias: paddle.nn.functional.conv2d_transpose,paddle.nn.functional.conv.conv2d_transpose
+
+    Apply a 2-D transposed convolution to ``input`` with an explicitly
+    supplied filter.
+
+    The output is calculated from the input, the filter, and the dilations,
+    strides and paddings. Input and output are in NCHW or NHWC format, where
+    N is batch size, C is the number of channels, H is the height of the
+    feature, and W is the width of the feature. The dilations, strides and
+    paddings each have two elements, representing height and width
+    respectively. For details of the transposed convolution, please refer to
+    the following explanation and the references
+    `therein <https://arxiv.org/pdf/1603.07285.pdf>`_.
+
+    If a bias and an activation type are provided, the bias is added to
+    the output of the convolution, and the corresponding activation function
+    is applied to the final result.
+
+    For each input :math:`X`, the equation is:
+
+    .. math::
+
+        Out = \\sigma (W \\ast X + b)
+
+    Where:
+
+    * :math:`X`: Input value, a 4-D Tensor with NCHW or NHWC format.
+    * :math:`W`: Filter value, a 4-D Tensor with MCHW format.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D Tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, a 4-D Tensor with data format 'NCHW' or 'NHWC', the shape of :math:`Out` and :math:`X` may be different.
+
+    Example:
+
+        - Input:
+
+          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
+
+          Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`
+
+        - Output:
+
+          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
+
+        Where
+
+        .. math::
+
+           H^\\prime_{out} &= (H_{in} - 1) * strides[0] - pad_height_top - pad_height_bottom + dilations[0] * (H_f - 1) + 1 \\\\
+           W^\\prime_{out} &= (W_{in} - 1) * strides[1] - pad_width_left - pad_width_right + dilations[1] * (W_f - 1) + 1 \\\\
+           H_{out} &\\in [ H^\\prime_{out}, H^\\prime_{out} + strides[0] ] \\\\
+           W_{out} &\\in [ W^\\prime_{out}, W^\\prime_{out} + strides[1] ]
+
+    Note:
+          The conv2d_transpose can be seen as the backward of the conv2d. For conv2d,
+          when stride > 1, conv2d maps multiple input shapes to the same output shape,
+          so for conv2d_transpose, when stride > 1, one input shape maps to multiple output shapes.
+          If output_size is None, :math:`H_{out} = H^\\prime_{out}, W_{out} = W^\\prime_{out}`;
+          else, the :math:`H_{out}` of the output size must be between :math:`H^\\prime_{out}`
+          and :math:`H^\\prime_{out} + strides[0]`, and the :math:`W_{out}` of the output size must be
+          between :math:`W^\\prime_{out}` and :math:`W^\\prime_{out} + strides[1]`.
+
+    Args:
+        input(Variable): 4-D Tensor with [N, C, H, W] or [N, H, W, C] format,
+            whose data type is float16, float32 or float64 (the dtype is only
+            checked in static-graph mode).
+        weight(Variable): The convolution kernel, a Tensor with shape [C, M/g, kH, kW],
+            where M is the number of output channels(filters), g is the number of groups,
+            kH is the height of the kernel, and kW is the width of the kernel.
+        bias(Variable, optional): The bias, a Tensor with shape [M, ].
+        output_size(int|tuple|list, optional): The output image size. If output size is a
+            tuple, it must contain two integers, (image_height, image_width). If None,
+            an empty list is passed to the operator as `output_size`.
+            If output_size is specified, output_size and the shape of `weight`
+            should follow the formula above. Default: None.
+        padding(int|list|str|tuple, optional): The padding size. The padding argument effectively adds
+             `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a
+             string, either 'VALID' or 'SAME' supported, which is the padding algorithm.
+             If `padding` is a tuple or list, it could be in three forms:
+             `[pad_height, pad_width]` or
+            `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and
+            when `data_format` is `'NCHW'`,
+            `padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
+            when `data_format` is `'NHWC'`, `padding` can be in the form
+            `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
+            Default: padding = 0.
+        stride(int|tuple, optional): The stride size. It means the stride in transposed convolution.
+            If stride is a tuple, it must contain two integers, (stride_height, stride_width).
+            Otherwise, stride_height = stride_width = stride. Default: stride = 1.
+        dilation(int|tuple, optional): The dilation size. It means the spacing between the kernel points.
+            If dilation is a tuple, it must contain two integers, (dilation_height, dilation_width).
+            Otherwise, dilation_height = dilation_width = dilation. Default: dilation = 1.
+        groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by
+            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
+            when group=2, the first half of the filters is only connected to the
+            first half of the input channels, while the second half of the
+            filters is only connected to the second half of the input channels.
+            Default: groups = 1.
+        use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True.
+        act (str, optional): Activation type, if it is set to None, activation is not appended.
+            Default: None.
+        data_format (str, optional): Specify the data format of the input, and the data format of the output
+            will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
+            The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+            `[batch_size, input_channels, input_height, input_width]`.
+        name(str, optional): For detailed information, please refer
+           to :ref:`api_guide_Name`. Usually name is no need to set and
+           None by default.
+
+    Returns:
+        A Variable holding Tensor representing the conv2d_transpose, whose
+        data type is the same with input and shape is (num_batches, channels, out_h,
+        out_w) or (num_batches, out_h, out_w, channels). If act is None, the tensor variable
+        storing the transposed convolution result, and if act is not None, the
+        tensor variable storing transposed convolution and non-linearity activation
+        result.
+
+    Raises:
+        ValueError: If the type of `use_cudnn` is not bool.
+        ValueError: If `data_format` is not "NCHW" or "NHWC".
+        ValueError: If the channel dimension of the input is negative (i.e. not defined).
+        ValueError: If the number of input channels is not divisible by `groups`.
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0
+            or the element corresponding to the input's channel is not 0.
+        ValueError: If `output_size` is neither None nor an int, list or tuple.
+        ShapeError: Remaining shape mismatches (input not 4-D, input/filter rank
+            mismatch, channel count not matching the filter, `output_size` not
+            matching `stride` in length) are not checked in this function;
+            they are presumably reported by the underlying operator -- TODO confirm.
+
+    Examples:
+        .. code-block:: python
+
+          from paddle import fluid
+          import paddle.nn.functional as F
+          import paddle.fluid.dygraph as dg
+          import numpy as np
+          x = np.random.randn(2, 3, 8, 8).astype(np.float32)
+          w = np.random.randn(3, 6, 3, 3).astype(np.float32)
+          place = fluid.CPUPlace()
+          with dg.guard(place):
+              x_var = dg.to_variable(x)
+              w_var = dg.to_variable(w)
+              y_var = F.conv2d_transpose(x_var, w_var, act="relu")
+              y_np = y_var.numpy()
+          print(y_np.shape)
+          # (2, 6, 10, 10)
+    """
+
+    # entry checks
+    if not isinstance(use_cudnn, bool):
+        raise ValueError("Attr(use_cudnn) should be True or False. "
+                         "Received Attr(use_cudnn): {}.".format(use_cudnn))
+    if data_format not in ['NCHW', 'NHWC']:
+        raise ValueError(
+            "Attr(data_format) of conv2d_transpose got wrong value: "
+            "received {}, but only 'NCHW' or 'NHWC' are supported.".format(
+                data_format))
+    channel_last = (data_format == "NHWC")
+    channel_dim = -1 if channel_last else 1
+    num_channels = input.shape[channel_dim]
+    # A negative channel size means the dimension is not known yet.
+    if num_channels < 0:
+        raise ValueError("The channel dimmention of the input({}) "
+                         "should be defined. Received: {}.".format(
+                             input.shape, num_channels))
+    if num_channels % groups != 0:
+        raise ValueError(
+            "the channel of input must be divisible by groups,"
+            "received: the channel of input is {}, the shape of input is {}"
+            ", the groups is {}".format(num_channels, input.shape, groups))
+
+    # update attrs
+    padding, padding_algorithm = _update_padding_nd(padding, channel_last, 2)
+    stride = utils.convert_to_list(stride, 2, 'stride')
+    dilation = utils.convert_to_list(dilation, 2, 'dilation')
+    # An empty list is what gets passed to the op when no size is requested.
+    if output_size is None:
+        output_size = []
+    elif isinstance(output_size, (list, tuple, int)):
+        output_size = utils.convert_to_list(output_size, 2, 'output_size')
+    else:
+        raise ValueError("output_size should be int, or list, tuple of ints")
+
+    # The depthwise op is selected only when cuDNN is not requested, every
+    # input channel forms its own group and dim 1 of the weight is 1.
+    op_type = 'conv2d_transpose'
+    num_filters = weight.shape[1]
+    if (num_channels == groups and num_filters == 1 and not use_cudnn):
+        op_type = 'depthwise_conv2d_transpose'
+
+    if in_dygraph_mode():
+        # Dygraph ops take their attributes as a flat (name, value, ...) tuple.
+        attrs = ('output_size', output_size, 'strides', stride, 'paddings',
+                 padding, 'padding_algorithm', padding_algorithm, 'dilations',
+                 dilation, 'groups', groups, 'use_cudnn', use_cudnn,
+                 'data_format', data_format)
+        pre_bias = getattr(core.ops, op_type)(input, weight, *attrs)
+        if bias is not None:
+            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+        else:
+            pre_act = pre_bias
+        out = dygraph_utils._append_activation_in_dygraph(
+            pre_act, act, use_cudnn=use_cudnn)
+    else:
+        inputs = {'Input': [input], 'Filter': [weight]}
+        attrs = {
+            'output_size': output_size,
+            'strides': stride,
+            'paddings': padding,
+            'padding_algorithm': padding_algorithm,
+            'dilations': dilation,
+            'groups': groups,
+            'use_cudnn': use_cudnn,
+            'data_format': data_format
+        }
+        check_variable_and_dtype(input, 'input',
+                                 ['float16', 'float32', 'float64'],
+                                 'conv2d_transpose')
+        # NOTE: every local defined so far is forwarded to LayerHelper through
+        # **locals(), so the parameter and local names above must stay as-is.
+        helper = LayerHelper(op_type, **locals())
+        dtype = helper.input_dtype()
+        pre_bias = helper.create_variable_for_type_inference(dtype)
+        outputs = {"Output": [pre_bias]}
+        helper.append_op(
+            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+        if bias is not None:
+            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+        else:
+            pre_act = pre_bias
+        out = helper.append_activation(pre_act)
+    return out
+
+
+def conv3d(input,
+           weight,
+           bias=None,
+           padding=0,
+           stride=1,
+           dilation=1,
+           groups=1,
+           use_cudnn=True,
+           act=None,
+           data_format="NCDHW",
+           name=None):
+    """
+	:alias_main: paddle.nn.functional.conv3d
+	:alias: paddle.nn.functional.conv3d,paddle.nn.functional.conv.conv3d
+    The convolution3D layer calculates the output based on the input, filter
+    and strides, paddings, dilations, groups parameters. Input(Input) and
+    Output(Output) are in NCDHW or NDHWC format. Where N is batch size C is the number of
+    channels, D is the depth of the feature, H is the height of the feature,
+    and W is the width of the feature. Convlution3D is similar with Convlution2D
+    but adds one dimension(depth). If bias attribution and activation type are
+    provided, bias is added to the output of the convolution, and the
+    corresponding activation function is applied to the final result.
+    For each input :math:`X`, the equation is:
+    .. math::
+        Out = \sigma (W \\ast X + b)
+    In the above equation:
+    * :math:`X`: Input value, a tensor with NCDHW or NDHWC format.
+    * :math:`W`: Filter value, a tensor with MCDHW format.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+    Example:
+        - Input:
+          Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
+          Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)`
+        - Output:
+          Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
+        Where
+        .. math::
+            D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\
+            H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\
+            W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1
+    Args:
+        input (Variable): The input is 5-D Tensor with shape [N, C, D, H, W], the data 
+            type of input is float16 or float32 or float64.
+        weight (Variable): The convolution kernel, a Tensor with shape [M, C/g, kD, kH, kW],
+            where M is the number of filters(output channels), g is the number of groups,
+            kD, kH, kW are the filter's depth, height and width respectively.
+        bias (Variable, optional): The bias, a Tensor of shape [M, ].
+        padding (string|int|list|tuple): The padding size. It means the number of zero-paddings 
+            on both sides for each dimension. If `padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If padding size is a tuple or list,
+            it could be in three forms: `[pad_depth, pad_height, pad_width]` or
+            `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
+            and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form
+            `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
+            when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
+            `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
+            Default: padding = 0.
+        stride (int|tuple): The stride size. It means the stride in convolution. If stride is a 
+            tuple, it must contain three integers, (stride_depth, stride_height, stride_width). 
+            Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1.
+        dilation (int|tuple): The dilation size. It means the spacing between the kernel points. 
+            If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height,
+            dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. 
+            Default: dilation = 1.
+        groups (int): The groups number of the Conv3d Layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: groups=1
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        act (str): Activation type, if it is set to None, activation is not appended.
+            Default: None.
+        data_format (str, optional): Specify the data format of the input, and the data format of the output 
+            will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
+            The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+            `[batch_size, input_channels, input_height, input_width]`.
+        name(str|None): For detailed information, please refer 
+           to :ref:`api_guide_Name`. Usually name is no need to set and 
+           None by default.
+    Returns:
+        A Variable holding Tensor representing the conv3d, whose data type is 
+        the same with input. If act is None, the tensor variable storing the 
+        convolution result, and if act is not None, the tensor variable storing 
+        convolution and non-linearity activation result.
+    Raises:
+        ValueError: If the type of `use_cudnn` is not bool.
+        ValueError: If `data_format` is not "NCDHW" or "NDHWC".
+        ValueError: If the channel dimmention of the input is less than or equal to zero.
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 
+            or the element corresponding to the input's channel is not 0.
+        ShapeError: If the input is not 5-D Tensor.
+        ShapeError: If the input's dimension size and filter's dimension size not equal.
+        ShapeError: If the dimension size of input minus the size of `stride` is not 2.
+        ShapeError: If the number of input channels is not equal to filter's channels * groups.
+        ShapeError: If the number of output channels is not be divided by groups.
+    Examples:
+        .. code-block:: python
+            from paddle import fluid
+            import paddle.nn.functional as F
+            import paddle.fluid.dygraph as dg
+            import numpy as np
+            x = np.random.randn(2, 3, 8, 8, 8).astype(np.float32)
+            w = np.random.randn(6, 3, 3, 3, 3).astype(np.float32)
+            place = fluid.CPUPlace()
+            with dg.guard(place):
+                x_var = dg.to_variable(x)
+                w_var = dg.to_variable(w)
+                y_var = F.conv3d(x_var, w_var, act="relu")
+                y_np = y_var.numpy()
+            print(y_np.shape)
+            # (2, 6, 6, 6, 6)
+    """
+    # entry check
+    if not isinstance(use_cudnn, bool):
+        raise ValueError("Attr(use_cudnn) should be True or False. Received "
+                         "Attr(use_cudnn): {}. ".format(use_cudnn))
+
+    if data_format not in ["NCDHW", "NDHWC"]:
+        raise ValueError(
+            "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
+            "Attr(data_format): {}.".format(data_format))
+
+    channel_last = (data_format == "NDHWC")
+    channel_dim = -1 if channel_last else 1
+    num_channels = input.shape[channel_dim]
+    num_filters = weight.shape[0]
+    if num_channels < 0:
+        raise ValueError(
+            "The channel dimmention of the input({}) should be defined. "
+            "Received: {}.".format(input.shape, num_channels))
+    if num_channels % groups != 0:
+        raise ValueError(
+            "The number of input channels must be divisible by Attr(groups). "
+            "Received: number of channels({}), groups({}).".format(num_channels,
+                                                                   groups))
+    if num_filters % groups != 0:
+        raise ValueError(
+            "The number of filters must be divisible by Attr(groups). "
+            "Received: number of filters({}), groups({}).".format(num_filters,
+                                                                  groups))
+
+    padding, padding_algorithm = _update_padding_nd(padding, channel_last, 3)
+    stride = utils.convert_to_list(stride, 3, 'stride')
+    dilation = utils.convert_to_list(dilation, 3, 'dilation')
+    op_type = "conv3d"
+
+    if in_dygraph_mode():
+        attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation,
+                 'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False,
+                 "padding_algorithm", padding_algorithm, "data_format",
+                 data_format)
+        pre_bias = getattr(core.ops, op_type)(input, weight, *attrs)
+        if bias is not None:
+            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+        else:
+            pre_act = pre_bias
+        out = dygraph_utils._append_activation_in_dygraph(
+            pre_act, act, use_cudnn=use_cudnn)
+    else:
+        inputs = {'Input': [input], 'Filter': [weight]}
+        attrs = {
+            'strides': stride,
+            'paddings': padding,
+            'dilations': dilation,
+            'groups': groups,
+            'use_cudnn': use_cudnn,
+            'use_mkldnn': False,
+            "padding_algorithm": padding_algorithm,
+            "data_format": data_format
+        }
+        helper = LayerHelper(op_type, **locals())
+        dtype = helper.input_dtype()
+        check_variable_and_dtype(input, 'input',
+                                 ['float16', 'float32', 'float64'], 'conv3d')
+
+        pre_bias = helper.create_variable_for_type_inference(dtype)
+        outputs = {"Output": [pre_bias]}
+
+        helper.append_op(
+            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+        if bias is not None:
+            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+        else:
+            pre_act = pre_bias
+        out = helper.append_activation(pre_act)
+
+    return out
+
+
+def conv3d_transpose(input,
+                     weight,
+                     bias=None,
+                     output_size=None,
+                     padding=0,
+                     stride=1,
+                     dilation=1,
+                     groups=1,
+                     use_cudnn=True,
+                     act=None,
+                     data_format='NCDHW',
+                     name=None):
+    r"""
+    :alias_main: paddle.nn.functional.conv3d_transpose
+    :alias: paddle.nn.functional.conv3d_transpose,paddle.nn.functional.conv.conv3d_transpose
+
+    The convolution3D transpose layer calculates the output based on the input,
+    filter, and dilations, strides, paddings. Input(Input) and output(Output)
+    are in NCDHW or NDHWC format, where N is batch size, C is the number of
+    channels, D is the depth of the feature, H is the height of the feature,
+    and W is the width of the feature. Parameters(dilations, strides, paddings)
+    have three elements, which represent depth, height and width respectively.
+    For details of the transposed convolution, please refer to the following
+    explanation and references `therein <https://arxiv.org/pdf/1603.07285.pdf>`_.
+    If a bias and an activation type are provided, the bias is added to the
+    output of the convolution, and the corresponding activation function is
+    applied to the final result.
+
+    For each input :math:`X`, the equation is:
+
+    .. math::
+
+        Out = \sigma (W \ast X + b)
+
+    In the above equation:
+
+    * :math:`X`: Input value, a Tensor with NCDHW or NDHWC format.
+    * :math:`W`: Filter value, a Tensor with MCDHW format.
+    * :math:`\ast`: Convolution operation.
+    * :math:`b`: Bias value, a Tensor of shape [M, ].
+    * :math:`\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+
+    Example:
+
+        - Input:
+
+          Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
+
+          Filter shape: :math:`(C_{in}, C_{out}, D_f, H_f, W_f)`
+
+        - Output:
+
+          Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
+
+        Where
+
+        .. math::
+
+           D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\
+           H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\
+           W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1 \\
+           D_{out} &\in [ D^\prime_{out}, D^\prime_{out} + strides[0] ] \\
+           H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[1] ] \\
+           W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[2] ]
+
+    Note:
+          The conv3d_transpose can be seen as the backward of the conv3d. For
+          conv3d, when stride > 1, conv3d maps multiple input shapes to the
+          same output shape, so for conv3d_transpose, when stride > 1, one
+          input shape maps to multiple output shapes.
+          If output_size is None, :math:`D_{out} = D^\prime_{out}`,
+          :math:`H_{out} = H^\prime_{out}` and :math:`W_{out} = W^\prime_{out}`;
+          otherwise the :math:`D_{out}` of the output size must be between
+          :math:`D^\prime_{out}` and :math:`D^\prime_{out} + strides[0]`,
+          the :math:`H_{out}` must be between :math:`H^\prime_{out}` and
+          :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` must be
+          between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`.
+
+    Args:
+        input(Variable): The input is 5-D Tensor with shape [N, C, D, H, W] or
+            [N, D, H, W, C]. In static-graph mode its data type is checked to
+            be float16, float32 or float64.
+        weight (Variable): The convolution kernel, a Tensor with shape [C, M/g, kD, kH, kW],
+            where M is the number of filters(output channels), g is the number of groups,
+            kD, kH, kW are the filter's depth, height and width respectively.
+        bias (Variable, optional): The bias, a Tensor of shape [M, ]. It is
+            added along the channel axis. Default: None.
+        output_size(int|list|tuple, optional): The output size. An int is
+            expanded to three equal values; a list or tuple must contain three
+            integers, (out_depth, out_height, out_width). When given, it should
+            follow the formula above. If None, an empty list is passed to the
+            operator. Default: None.
+        padding(int|list|str|tuple, optional): The padding size. If `padding` is a string,
+            either 'VALID' or 'SAME' is supported, which is the padding algorithm. If `padding`
+            is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or
+            `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
+            and when `data_format` is `'NCDHW'`, `padding` can be in the form
+            `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
+            When `data_format` is `'NDHWC'`, `padding` can be in the form
+            `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
+            The value is normalized by `_update_padding_nd`. Default: padding = 0.
+        stride(int|tuple, optional): The stride size. It means the stride in transposed convolution.
+            If stride is a tuple, it must contain three integers, (stride_depth, stride_height,
+            stride_width). Otherwise, stride_depth = stride_height = stride_width = stride.
+            Default: stride = 1.
+        dilation(int|tuple, optional): The dilation size. It means the spacing between the kernel points.
+            If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height,
+            dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation.
+            Default: dilation = 1.
+        groups(int, optional): The groups number of the Conv3d transpose layer. Inspired by
+            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
+            when group=2, the first half of the filters is only connected to the
+            first half of the input channels, while the second half of the
+            filters is only connected to the second half of the input channels.
+            The number of input channels must be divisible by `groups`.
+            Default: groups=1
+        use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        act (str, optional): Activation type, if it is set to None, activation is not appended.
+            Default: None.
+        data_format (str, optional): Specify the data format of the input, and the data format of the output
+            will be consistent with that of the input. An optional string from: `"NCDHW"`, `"NDHWC"`.
+            The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of:
+            `[batch_size, input_channels, input_depth, input_height, input_width]`.
+        name(str, optional): For detailed information, please refer
+           to :ref:`api_guide_Name`. Usually name is no need to set and
+           None by default.
+
+    Returns:
+        A Variable holding Tensor representing the conv3d_transpose, whose data
+        type is the same with input and shape is (num_batches, channels, out_d, out_h,
+        out_w) or (num_batches, out_d, out_h, out_w, channels). If act is None, the tensor
+        variable storing the transposed convolution result, and if act is not None, the tensor
+        variable storing transposed convolution and non-linearity activation result.
+
+    Raises:
+        ValueError: If the type of `use_cudnn` is not bool.
+        ValueError: If `data_format` is not "NCDHW" or "NDHWC".
+        ValueError: If the channel dimension of the input is negative (undefined).
+        ValueError: If the number of input channels is not divisible by `groups`.
+        ValueError: If `output_size` is not None, an int, a list or a tuple.
+        ValueError: If `padding` is rejected by `_update_padding_nd` (for example a
+            string other than "SAME" or "VALID", or non-zero padding on the batch
+            or channel dimension).
+        ShapeError: Shape mismatches between input, filter, `stride` and
+            `output_size` are reported by the underlying operator.
+
+    Examples:
+       .. code-block:: python
+
+          from paddle import fluid
+          import paddle.nn.functional as F
+          import paddle.fluid.dygraph as dg
+          import numpy as np
+          x = np.random.randn(2, 3, 8, 8, 8).astype(np.float32)
+          w = np.random.randn(3, 6, 3, 3, 3).astype(np.float32)
+          place = fluid.CPUPlace()
+          with dg.guard(place):
+              x_var = dg.to_variable(x)
+              w_var = dg.to_variable(w)
+              y_var = F.conv3d_transpose(x_var, w_var, act="relu")
+              y_np = y_var.numpy()
+          print(y_np.shape)
+          # (2, 6, 10, 10, 10)
+    """
+    # entry checks
+    if not isinstance(use_cudnn, bool):
+        raise ValueError("Attr(use_cudnn) should be True or False. "
+                         "Received Attr(use_cudnn): {}.".format(use_cudnn))
+    if data_format not in ["NCDHW", "NDHWC"]:
+        raise ValueError(
+            "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
+            "Attr(data_format): {}.".format(data_format))
+
+    channel_last = (data_format == "NDHWC")
+    channel_dim = -1 if channel_last else 1
+    num_channels = input.shape[channel_dim]
+    # A negative channel size means the dimension is not known yet.
+    if num_channels < 0:
+        raise ValueError(
+            "The channel dimmention of the input({}) should be defined. "
+            "Received: {}.".format(input.shape, num_channels))
+    if num_channels % groups != 0:
+        raise ValueError(
+            "The number of input channels must be divisible by Attr(groups). "
+            "Received: number of channels({}), groups({}).".format(num_channels,
+                                                                   groups))
+
+    # Normalize every spatial argument to its 3-D (depth, height, width) form.
+    padding, padding_algorithm = _update_padding_nd(padding, channel_last, 3)
+    stride = utils.convert_to_list(stride, 3, 'stride')
+    dilation = utils.convert_to_list(dilation, 3, 'dilation')
+    if output_size is None:
+        output_size = []
+    elif isinstance(output_size, (list, tuple, int)):
+        output_size = utils.convert_to_list(output_size, 3, 'output_size')
+    else:
+        raise ValueError("output_size should be int, or list, tuple of ints")
+
+    op_type = 'conv3d_transpose'
+    # The operator attribute is given with the 4-letter layout names.
+    # NOTE(review): presumably what the conv3d_transpose kernel expects -- confirm.
+    data_format_ = "NHWC" if channel_last else "NCHW"
+
+    if in_dygraph_mode():
+        # Imperative mode: call the C++ operator directly with flattened attrs.
+        attrs = ('output_size', output_size, 'paddings', padding,
+                 "padding_algorithm", padding_algorithm, 'strides', stride,
+                 'dilations', dilation, 'groups', groups, 'use_cudnn',
+                 use_cudnn, "data_format", data_format_)
+        pre_bias = getattr(core.ops, op_type)(input, weight, *attrs)
+        if bias is not None:
+            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+        else:
+            pre_act = pre_bias
+        out = dygraph_utils._append_activation_in_dygraph(
+            pre_act, act, use_cudnn=use_cudnn)
+    else:
+        # Static-graph mode: append the operator to the current program.
+        inputs = {'Input': [input], 'Filter': [weight]}
+        attrs = {
+            'output_size': output_size,
+            'paddings': padding,
+            "padding_algorithm": padding_algorithm,
+            'strides': stride,
+            'dilations': dilation,
+            'groups': groups,
+            'use_cudnn': use_cudnn,
+            "data_format": data_format_
+        }
+        # LayerHelper picks `input`, `act` and `name` out of the local scope.
+        helper = LayerHelper(op_type, **locals())
+        dtype = helper.input_dtype()
+        # Report dtype errors under this operator's own name.
+        check_variable_and_dtype(input, 'input',
+                                 ['float16', 'float32', 'float64'], op_type)
+
+        pre_bias = helper.create_variable_for_type_inference(dtype)
+        outputs = {"Output": [pre_bias]}
+
+        helper.append_op(
+            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+        if bias is not None:
+            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+        else:
+            pre_act = pre_bias
+        out = helper.append_activation(pre_act)
+
+    return out
--- a/demo/once_for_all/dy_models/mobilenet_v1.py
+++ b/demo/once_for_all/dy_models/mobilenet_v1.py
+#   Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import time
+import sys
+import math
+import numpy as np
+import argparse
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.initializer import MSRA
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
+from paddle.fluid.dygraph.base import to_variable
+from paddle.fluid import framework
+
+
+class ConvBNLayer(fluid.dygraph.Layer):
+    """A bias-free Conv2D followed by BatchNorm.
+
+    The activation named by `act` is applied by the BatchNorm layer, not by
+    the convolution. `channels` and `name` are accepted but not used here.
+    """
+
+    def __init__(self,
+                 num_channels,
+                 filter_size,
+                 num_filters,
+                 stride,
+                 padding,
+                 channels=None,
+                 num_groups=1,
+                 act='relu',
+                 use_cudnn=True,
+                 name=None):
+        super(ConvBNLayer, self).__init__()
+
+        # Convolution weights use MSRA initialization and a layer-scoped name.
+        conv_weight_attr = ParamAttr(
+            initializer=MSRA(), name=self.full_name() + "_weights")
+        self._conv = Conv2D(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            act=None,
+            use_cudnn=use_cudnn,
+            param_attr=conv_weight_attr,
+            bias_attr=False)
+
+        # All batch-norm parameters share the "<layer name>_bn" prefix.
+        bn_prefix = self.full_name() + "_bn"
+        self._batch_norm = BatchNorm(
+            num_filters,
+            act=act,
+            param_attr=ParamAttr(name=bn_prefix + "_scale"),
+            bias_attr=ParamAttr(name=bn_prefix + "_offset"),
+            moving_mean_name=bn_prefix + '_mean',
+            moving_variance_name=bn_prefix + '_variance')
+
+    def forward(self, inputs):
+        """Run the convolution, then batch norm (with its activation)."""
+        return self._batch_norm(self._conv(inputs))
+
+
+class DepthwiseSeparable(fluid.dygraph.Layer):
+    """A 3x3 grouped ConvBNLayer followed by a 1x1 ConvBNLayer.
+
+    `num_filters1`, `num_filters2` and `num_groups` are unscaled values; each
+    is multiplied by `scale` and truncated to int. `name` is accepted but not
+    used.
+    """
+
+    def __init__(self,
+                 num_channels,
+                 num_filters1,
+                 num_filters2,
+                 num_groups,
+                 stride,
+                 scale,
+                 name=None):
+        super(DepthwiseSeparable, self).__init__()
+
+        # Width of the intermediate feature map between the two convolutions.
+        mid_filters = int(num_filters1 * scale)
+        out_filters = int(num_filters2 * scale)
+
+        # Grouped 3x3 convolution; cuDNN is switched off for this one.
+        self._depthwise_conv = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=mid_filters,
+            filter_size=3,
+            stride=stride,
+            padding=1,
+            num_groups=int(num_groups * scale),
+            use_cudnn=False)
+
+        # 1x1 convolution that mixes channels.
+        self._pointwise_conv = ConvBNLayer(
+            num_channels=mid_filters,
+            filter_size=1,
+            num_filters=out_filters,
+            stride=1,
+            padding=0)
+
+    def forward(self, inputs):
+        """Apply the depthwise stage and then the pointwise stage."""
+        depthwise_out = self._depthwise_conv(inputs)
+        return self._pointwise_conv(depthwise_out)
+
+
+class MobileNetV1(fluid.dygraph.Layer):
+    """MobileNetV1 classifier built from DepthwiseSeparable blocks.
+
+    Args:
+        scale (float): Width multiplier applied to every channel count.
+        class_dim (int): Output size of the final linear layer.
+    """
+
+    def __init__(self, scale=1.0, class_dim=1000):
+        super(MobileNetV1, self).__init__()
+        self.scale = scale
+        self.dwsl = []
+        # Width of the pooled feature vector; it follows the width multiplier,
+        # so forward() must not assume the unscaled value 1024.
+        self._out_c = int(1024 * scale)
+
+        self.conv1 = ConvBNLayer(
+            num_channels=3,
+            filter_size=3,
+            channels=3,
+            num_filters=int(32 * scale),
+            stride=2,
+            padding=1)
+
+        # Each entry: (sublayer name, unscaled input channels,
+        #              unscaled output channels, stride).
+        # The unscaled input channel count is also the depthwise filter count
+        # and the group count of the block.
+        block_settings = [
+            ("conv2_1", 32, 64, 1),
+            ("conv2_2", 64, 128, 2),
+            ("conv3_1", 128, 128, 1),
+            ("conv3_2", 128, 256, 2),
+            ("conv4_1", 256, 256, 1),
+            ("conv4_2", 256, 512, 2),
+        ]
+        block_settings += [("conv5_" + str(i + 1), 512, 512, 1)
+                           for i in range(5)]
+        block_settings += [
+            ("conv5_6", 512, 1024, 2),
+            ("conv6", 1024, 1024, 1),
+        ]
+
+        # Blocks are created in table order so sublayer registration order
+        # (and therefore generated layer names) stays the same as before.
+        for block_name, in_c, out_c, stride in block_settings:
+            block = self.add_sublayer(
+                sublayer=DepthwiseSeparable(
+                    num_channels=int(in_c * scale),
+                    num_filters1=in_c,
+                    num_filters2=out_c,
+                    num_groups=in_c,
+                    stride=stride,
+                    scale=scale),
+                name=block_name)
+            self.dwsl.append(block)
+
+        self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
+
+        self.out = Linear(
+            self._out_c,
+            class_dim,
+            param_attr=ParamAttr(
+                initializer=MSRA(), name=self.full_name() + "fc7_weights"),
+            bias_attr=ParamAttr(name="fc7_offset"))
+
+    def forward(self, inputs):
+        """Return the class scores for a batch of images."""
+        y = self.conv1(inputs)
+        for dws in self.dwsl:
+            y = dws(y)
+        y = self.pool2d_avg(y)
+        # Flatten to the scaled feature width expected by the linear layer.
+        y = fluid.layers.reshape(y, shape=[-1, self._out_c])
+        y = self.out(y)
+        return y
--- a/demo/once_for_all/dy_models/mobilenet_v2.py
+++ b/demo/once_for_all/dy_models/mobilenet_v2.py
+#   Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import time
+import math
+import sys
+import numpy as np
+import argparse
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.initializer import MSRA
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
+from paddle.fluid.dygraph.base import to_variable
+from paddle.fluid import framework
+
+
+class ConvBNLayer(fluid.dygraph.Layer):
+    """A bias-free Conv2D followed by BatchNorm, with an optional ReLU6.
+
+    The activation is chosen per call through `forward(..., if_act=...)`.
+    `channels` is accepted but not used here.
+    """
+
+    def __init__(self,
+                 num_channels,
+                 filter_size,
+                 num_filters,
+                 stride,
+                 padding,
+                 channels=None,
+                 num_groups=1,
+                 use_cudnn=True):
+        super(ConvBNLayer, self).__init__()
+
+        conv_weight_attr = ParamAttr(name=self.full_name() + "_weights")
+        self._conv = Conv2D(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            act=None,
+            use_cudnn=use_cudnn,
+            param_attr=conv_weight_attr,
+            bias_attr=False)
+
+        # All batch-norm parameters share the "<layer name>_bn" prefix.
+        bn_prefix = self.full_name() + "_bn"
+        self._batch_norm = BatchNorm(
+            num_filters,
+            param_attr=ParamAttr(name=bn_prefix + "_scale"),
+            bias_attr=ParamAttr(name=bn_prefix + "_offset"),
+            moving_mean_name=bn_prefix + '_mean',
+            moving_variance_name=bn_prefix + '_variance')
+
+    def forward(self, inputs, if_act=True):
+        """Run conv and batch norm; apply ReLU6 only when `if_act` is true."""
+        normalized = self._batch_norm(self._conv(inputs))
+        if not if_act:
+            return normalized
+        return fluid.layers.relu6(normalized)
+
+
+class InvertedResidualUnit(fluid.dygraph.Layer):
+    """Expand (1x1) -> grouped conv -> linear projection (1x1) unit.
+
+    The expanded width is `round(num_in_filter * expansion_factor)`. The
+    caller decides on each call whether the input is added back to the output.
+    """
+
+    def __init__(
+            self,
+            num_channels,
+            num_in_filter,
+            num_filters,
+            stride,
+            filter_size,
+            padding,
+            expansion_factor, ):
+        super(InvertedResidualUnit, self).__init__()
+        expanded_width = int(round(num_in_filter * expansion_factor))
+
+        # 1x1 convolution that widens the feature map.
+        self._expand_conv = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=expanded_width,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            num_groups=1)
+
+        # One group per channel; cuDNN is switched off for this one.
+        self._bottleneck_conv = ConvBNLayer(
+            num_channels=expanded_width,
+            num_filters=expanded_width,
+            filter_size=filter_size,
+            stride=stride,
+            padding=padding,
+            num_groups=expanded_width,
+            use_cudnn=False)
+
+        # 1x1 projection down to the output width.
+        self._linear_conv = ConvBNLayer(
+            num_channels=expanded_width,
+            num_filters=num_filters,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            num_groups=1)
+
+    def forward(self, inputs, ifshortcut):
+        """Apply the three stages; add `inputs` back when `ifshortcut` is true."""
+        expanded = self._expand_conv(inputs, if_act=True)
+        filtered = self._bottleneck_conv(expanded, if_act=True)
+        # The projection is called without activation.
+        projected = self._linear_conv(filtered, if_act=False)
+        if not ifshortcut:
+            return projected
+        return fluid.layers.elementwise_add(inputs, projected)
+
+
+class InvresiBlocks(fluid.dygraph.Layer):
+    """A stage of `n` InvertedResidualUnit layers.
+
+    Args:
+        in_c: Input channels of the stage.
+        t: Expansion factor passed to every unit.
+        c: Output channels of every unit.
+        n: Number of units in the stage.
+        s: Stride of the first unit; the remaining units use stride 1.
+    """
+
+    def __init__(self, in_c, t, c, n, s):
+        super(InvresiBlocks, self).__init__()
+
+        # The first unit changes the channel count and carries the stride.
+        self._first_block = InvertedResidualUnit(
+            num_channels=in_c,
+            num_in_filter=in_c,
+            num_filters=c,
+            stride=s,
+            filter_size=3,
+            padding=1,
+            expansion_factor=t)
+
+        # Remaining units are numbered 2..n in their sublayer names.
+        self._inv_blocks = []
+        for unit_no in range(2, n + 1):
+            unit = InvertedResidualUnit(
+                num_channels=c,
+                num_in_filter=c,
+                num_filters=c,
+                stride=1,
+                filter_size=3,
+                padding=1,
+                expansion_factor=t)
+            registered = self.add_sublayer(
+                sublayer=unit, name=self.full_name() + "_" + str(unit_no))
+            self._inv_blocks.append(registered)
+
+    def forward(self, inputs):
+        """Run the first unit without shortcut, the rest with shortcut."""
+        out = self._first_block(inputs, ifshortcut=False)
+        for unit in self._inv_blocks:
+            out = unit(out, ifshortcut=True)
+        return out
+
+
+class MobileNetV2(fluid.dygraph.Layer):
+    """MobileNetV2 classifier built from InvresiBlocks stages.
+
+    Args:
+        class_dim (int): Output size of the final linear layer.
+        scale (float): Width multiplier applied to the channel counts.
+    """
+
+    def __init__(self, class_dim=1000, scale=1.0):
+        super(MobileNetV2, self).__init__()
+        self.scale = scale
+        self.class_dim = class_dim
+
+        # Each entry: (expansion factor t, output channels c, repeats n, stride s).
+        stage_settings = [
+            (1, 16, 1, 1),
+            (6, 24, 2, 2),
+            (6, 32, 3, 2),
+            (6, 64, 4, 2),
+            (6, 96, 3, 1),
+            (6, 160, 3, 2),
+            (6, 320, 1, 1),
+        ]
+
+        # 1. stem convolution
+        self._conv1 = ConvBNLayer(
+            num_channels=3,
+            num_filters=int(32 * scale),
+            filter_size=3,
+            stride=2,
+            padding=1)
+
+        # 2. bottleneck stages, registered as conv2, conv3, ...
+        self._invl = []
+        in_c = int(32 * scale)
+        for stage_no, (t, c, n, s) in enumerate(stage_settings, start=2):
+            scaled_c = int(c * scale)
+            stage = self.add_sublayer(
+                sublayer=InvresiBlocks(
+                    in_c=in_c, t=t, c=scaled_c, n=n, s=s),
+                name='conv' + str(stage_no))
+            self._invl.append(stage)
+            # The next stage consumes what this one produces.
+            in_c = scaled_c
+
+        # 3. last convolution; its width only grows for scale > 1.0
+        if scale > 1.0:
+            self._out_c = int(1280 * scale)
+        else:
+            self._out_c = 1280
+        self._conv9 = ConvBNLayer(
+            num_channels=in_c,
+            num_filters=self._out_c,
+            filter_size=1,
+            stride=1,
+            padding=0)
+
+        # 4. global average pooling
+        self._pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
+
+        # 5. classifier
+        fc_weight_attr = ParamAttr(name=self.full_name() + "fc10_weights")
+        self._fc = Linear(
+            self._out_c,
+            class_dim,
+            param_attr=fc_weight_attr,
+            bias_attr=ParamAttr(name="fc10_offset"))
+
+    def forward(self, inputs):
+        """Return the class scores for a batch of images."""
+        feat = self._conv1(inputs, if_act=True)
+        for stage in self._invl:
+            feat = stage(feat)
+        feat = self._conv9(feat, if_act=True)
+        pooled = self._pool2d_avg(feat)
+        # Flatten to (batch, out_c) before the linear layer.
+        flat = fluid.layers.reshape(pooled, shape=[-1, self._out_c])
+        return self._fc(flat)
--- a/demo/once_for_all/dy_models/mobilenet_v3.py
+++ b/demo/once_for_all/dy_models/mobilenet_v3.py
+#   Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle.fluid as fluid
+from paddle.fluid.initializer import MSRA
+from paddle.fluid.param_attr import ParamAttr
+
+
+def make_divisible(v, divisor=8, min_value=None):
+    """Round `v` to a multiple of `divisor`, bounded below by `min_value`.
+
+    Args:
+        v: Value to round.
+        divisor: The rounded value is a multiple of this number unless the
+            `min_value` bound takes over.
+        min_value: Lower bound for the result; defaults to `divisor`.
+
+    Returns:
+        The rounded value, bumped by one extra `divisor` when it would
+        otherwise fall below 90% of `v`.
+    """
+    lower_bound = divisor if min_value is None else min_value
+    rounded = int(v + divisor / 2) // divisor * divisor
+    result = max(lower_bound, rounded)
+    # Do not let the rounding shrink the value by more than 10%.
+    return result + divisor if result < 0.9 * v else result
+
+
+class MobileNetV3_dy(fluid.dygraph.Layer):
+    """MobileNetV3 classifier for dygraph mode.
+
+    Args:
+        scale: Multiplier applied (through `make_divisible`) to the stem,
+            block and squeeze-conv channel counts.
+        model_name: Either 'large' or 'small'; selects the block table.
+        class_dim: Output size of the final linear layer.
+
+    Raises:
+        NotImplementedError: If `model_name` is neither 'large' nor 'small'.
+    """
+
+    def __init__(self, scale=1.0, model_name='small', class_dim=1000):
+        super(MobileNetV3_dy, self).__init__()
+
+        inplanes = 16
+        if model_name == "large":
+            self.cfg = [
+                # k, exp, c,  se,     nl,  s,
+                [3, 16, 16, False, 'relu', 1],
+                [3, 64, 24, False, 'relu', 2],
+                [3, 72, 24, False, 'relu', 1],
+                [5, 72, 40, True, 'relu', 2],
+                [5, 120, 40, True, 'relu', 1],
+                [5, 120, 40, True, 'relu', 1],
+                [3, 240, 80, False, 'hard_swish', 2],
+                [3, 200, 80, False, 'hard_swish', 1],
+                [3, 184, 80, False, 'hard_swish', 1],
+                [3, 184, 80, False, 'hard_swish', 1],
+                [3, 480, 112, True, 'hard_swish', 1],
+                [3, 672, 112, True, 'hard_swish', 1],
+                [5, 672, 160, True, 'hard_swish', 2],
+                [5, 960, 160, True, 'hard_swish', 1],
+                [5, 960, 160, True, 'hard_swish', 1],
+            ]
+            self.cls_ch_squeeze = 960
+            self.cls_ch_expand = 1280
+        elif model_name == "small":
+            self.cfg = [
+                # k, exp, c,  se,     nl,  s,
+                [3, 16, 16, True, 'relu', 2],
+                [3, 72, 24, False, 'relu', 2],
+                [3, 88, 24, False, 'relu', 1],
+                [5, 96, 40, True, 'hard_swish', 2],
+                [5, 240, 40, True, 'hard_swish', 1],
+                [5, 240, 40, True, 'hard_swish', 1],
+                [5, 120, 48, True, 'hard_swish', 1],
+                [5, 144, 48, True, 'hard_swish', 1],
+                [5, 288, 96, True, 'hard_swish', 2],
+                [5, 576, 96, True, 'hard_swish', 1],
+                [5, 576, 96, True, 'hard_swish', 1],
+            ]
+            self.cls_ch_squeeze = 576
+            self.cls_ch_expand = 1280
+        else:
+            raise NotImplementedError("mode[" + model_name +
+                                      "_model] is not implemented!")
+
+        # network
+        # conv1
+        self.conv1 = ConvBnLayer(
+            in_c=3,
+            out_c=make_divisible(inplanes * scale),
+            filter_size=3,
+            stride=2,
+            padding=1,
+            num_groups=1,
+            if_act=True,
+            act='hard_swish',
+            name=self.full_name() + 'conv1')
+        # conv_blocks: one ResidualUnit per cfg row, named conv2, conv3, ...
+        self.all_blocks = []
+        inplanes = make_divisible(inplanes * scale)
+        for i, (k, exp, c, se, nl, s) in enumerate(self.cfg):
+            block_name = self.full_name() + 'conv' + str(i + 2)
+            out_c = make_divisible(scale * c)
+            self.all_blocks.append(
+                ResidualUnit(
+                    in_c=inplanes,
+                    mid_c=make_divisible(scale * exp),
+                    out_c=out_c,
+                    filter_size=k,
+                    stride=s,
+                    ues_se=se,
+                    act=nl,
+                    name=block_name))
+            # A plain list is not tracked by the Layer, so register explicitly.
+            self.add_sublayer(sublayer=self.all_blocks[-1], name=block_name)
+            inplanes = out_c
+        # last_second_conv
+        self.last_second_conv = ConvBnLayer(
+            in_c=inplanes,
+            out_c=make_divisible(scale * self.cls_ch_squeeze),
+            filter_size=1,
+            stride=1,
+            padding=0,
+            num_groups=1,
+            if_act=True,
+            act='hard_swish',
+            name=self.full_name() + 'last_second_conv')
+        # global_avg_pool
+        self.pool = fluid.dygraph.Pool2D(
+            pool_type='avg', global_pooling=True, use_cudnn=False)
+        # last_conv
+        self.last_conv = fluid.dygraph.Conv2D(
+            num_channels=make_divisible(scale * self.cls_ch_squeeze),
+            num_filters=self.cls_ch_expand,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            act=None,
+            param_attr=ParamAttr(
+                name=self.full_name() + 'last_1x1_conv_weights'),
+            bias_attr=False)
+        # fc
+        self.fc = fluid.dygraph.Linear(
+            input_dim=self.cls_ch_expand,
+            output_dim=class_dim,
+            param_attr=ParamAttr(name=self.full_name() + 'fc_weights'))
+
+    def forward(self, inputs, label=None, dropout_prob=0.2):
+        """Compute the classifier output and, optionally, accuracies.
+
+        Args:
+            inputs: Input passed to the stem conv.
+            label: Optional labels; when given, top-1 and top-5 accuracy are
+                computed against the classifier output.
+            dropout_prob: Probability handed to the dropout applied before
+                the final linear layer.
+
+        Returns:
+            The linear layer output, or the tuple `(output, acc1, acc5)`
+            when `label` is not None.
+        """
+        x = self.conv1(inputs)
+        for block in self.all_blocks:
+            x = block(x)
+        x = self.last_second_conv(x)
+        x = self.pool(x)
+        x = self.last_conv(x)
+        x = fluid.layers.hard_swish(x)
+        x = fluid.layers.dropout(x=x, dropout_prob=dropout_prob)
+        x = fluid.layers.reshape(x, shape=[x.shape[0], x.shape[1]])
+        x = self.fc(x)
+
+        # Test for presence explicitly: the truthiness of a label tensor
+        # depends on its contents and is not defined for a whole batch.
+        if label is not None:
+            acc1 = fluid.layers.accuracy(input=x, label=label)
+            acc5 = fluid.layers.accuracy(input=x, label=label, k=5)
+            return x, acc1, acc5
+        return x
+
+
+class ConvBnLayer(fluid.dygraph.Layer):
+    """Bias-free Conv2D followed by BatchNorm and an optional activation.
+
+    Args:
+        in_c: Input channel count of the conv.
+        out_c: Output channel count of the conv and channel count of the BN.
+        filter_size: Conv filter size.
+        stride: Conv stride.
+        padding: Conv padding.
+        num_groups: Conv groups.
+        if_act: Whether `forward` applies an activation after the BN.
+        act: Activation name, 'relu' or 'hard_swish'; only read when
+            `if_act` is truthy.
+        use_cudnn: Forwarded to the conv.
+        name: Prefix for the conv and BN parameter names.
+    """
+
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 filter_size,
+                 stride,
+                 padding,
+                 num_groups=1,
+                 if_act=True,
+                 act=None,
+                 use_cudnn=True,
+                 name=''):
+        super(ConvBnLayer, self).__init__()
+        self.if_act = if_act
+        self.act = act
+        self.conv = fluid.dygraph.Conv2D(
+            num_channels=in_c,
+            num_filters=out_c,
+            filter_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            param_attr=ParamAttr(name=name + "_weights"),
+            bias_attr=False,
+            use_cudnn=use_cudnn,
+            act=None)
+        # The BN scale and offset use an L2 regularizer with coefficient 0.
+        self.bn = fluid.dygraph.BatchNorm(
+            num_channels=out_c,
+            act=None,
+            param_attr=ParamAttr(
+                name=name + "_bn" + "_scale",
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=0.0)),
+            bias_attr=ParamAttr(
+                name=name + "_bn" + "_offset",
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=0.0)),
+            moving_mean_name=name + "_bn" + '_mean',
+            moving_variance_name=name + "_bn" + '_variance')
+
+    def forward(self, x):
+        """Apply conv, BN and, when `if_act` is set, the chosen activation.
+
+        Raises:
+            ValueError: If `if_act` is truthy and `act` is neither 'relu'
+                nor 'hard_swish'.
+        """
+        x = self.conv(x)
+        x = self.bn(x)
+        if not self.if_act:
+            return x
+        if self.act == 'relu':
+            return fluid.layers.relu(x)
+        if self.act == 'hard_swish':
+            return fluid.layers.hard_swish(x)
+        # Raise instead of print + exit(): exit() would end the whole
+        # process with a success status and cannot be handled by the caller.
+        raise ValueError(
+            'The activation function is selected incorrectly: %r '
+            "(expected 'relu' or 'hard_swish')." % (self.act, ))
+
+
+class ResidualUnit(fluid.dygraph.Layer):
+    """Expand conv, depthwise conv, optional SE module and linear conv.
+
+    Args:
+        in_c: Input channel count.
+        mid_c: Channel count between the expand and linear convs.
+        out_c: Output channel count.
+        filter_size: Filter size of the depthwise conv.
+        stride: Stride of the depthwise conv.
+        ues_se: Whether an SEModule is inserted after the depthwise conv.
+        act: Activation name used by the expand and depthwise convs.
+        name: Prefix for the names of the sub-layers' parameters.
+    """
+
+    def __init__(self,
+                 in_c,
+                 mid_c,
+                 out_c,
+                 filter_size,
+                 stride,
+                 ues_se,
+                 act=None,
+                 name=''):
+        super(ResidualUnit, self).__init__()
+        # The identity shortcut is used only when input and output match.
+        self.if_shortcut = (stride == 1) and (in_c == out_c)
+        self.if_se = ues_se
+
+        # Settings shared by the two 1x1 convs.
+        pointwise = dict(filter_size=1, stride=1, padding=0)
+        depthwise_padding = int((filter_size - 1) // 2)
+
+        self.expand_conv = ConvBnLayer(
+            in_c=in_c,
+            out_c=mid_c,
+            if_act=True,
+            act=act,
+            name=name + '_expand',
+            **pointwise)
+        # groups == channels: one filter per channel.
+        self.bottleneck_conv = ConvBnLayer(
+            in_c=mid_c,
+            out_c=mid_c,
+            filter_size=filter_size,
+            stride=stride,
+            padding=depthwise_padding,
+            num_groups=mid_c,
+            if_act=True,
+            act=act,
+            name=name + '_depthwise')
+        if self.if_se:
+            self.mid_se = SEModule(mid_c, name=name + '_SE')
+        self.linear_conv = ConvBnLayer(
+            in_c=mid_c,
+            out_c=out_c,
+            if_act=False,
+            act=None,
+            name=name + '_linear',
+            **pointwise)
+
+    def forward(self, inputs):
+        """Return the unit output, plus `inputs` when the shortcut is on."""
+        out = self.bottleneck_conv(self.expand_conv(inputs))
+        if self.if_se:
+            out = self.mid_se(out)
+        out = self.linear_conv(out)
+        if not self.if_shortcut:
+            return out
+        return fluid.layers.elementwise_add(inputs, out)
+
+
+class SEModule(fluid.dygraph.Layer):
+    """Channel re-weighting: global pool, two 1x1 convs, hard sigmoid.
+
+    Args:
+        channel: Channel count of the input and of the produced weights.
+        reduction: The first conv outputs `channel // reduction` channels.
+        name: Prefix for the conv parameter names.
+    """
+
+    def __init__(self, channel, reduction=4, name=''):
+        super(SEModule, self).__init__()
+        reduced_c = channel // reduction
+        self.avg_pool = fluid.dygraph.Pool2D(
+            pool_type='avg', global_pooling=True, use_cudnn=False)
+        # channel -> reduced_c, with ReLU
+        self.conv1 = fluid.dygraph.Conv2D(
+            num_channels=channel,
+            num_filters=reduced_c,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            act='relu',
+            param_attr=ParamAttr(name=name + "_weights1"),
+            bias_attr=ParamAttr(name=name + "_bias1"))
+        # reduced_c -> channel, no activation
+        self.conv2 = fluid.dygraph.Conv2D(
+            num_channels=reduced_c,
+            num_filters=channel,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            act=None,
+            param_attr=ParamAttr(name=name + "_weights2"),
+            bias_attr=ParamAttr(name=name + "_bias2"))
+
+    def forward(self, inputs):
+        """Multiply `inputs` by the weights computed from its pooled value."""
+        pooled = self.avg_pool(inputs)
+        weights = self.conv2(self.conv1(pooled))
+        weights = fluid.layers.hard_sigmoid(weights, slope=0.2, offset=0.5)
+        return fluid.layers.elementwise_mul(x=inputs, y=weights, axis=0)
+
+
+if __name__ == "__main__":
+    # Smoke test: run one random batch through the 0.6x large model on CPU
+    # and print the shape of the output.
+    import numpy as np
+    with fluid.dygraph.guard(fluid.CPUPlace()):
+        net = MobileNetV3_dy(class_dim=1000, model_name='large', scale=0.6)
+
+        batch = np.random.uniform(0, 255, [8, 3, 224, 224])
+        batch = fluid.dygraph.to_variable(batch.astype('float32'))
+        output = net(batch)
+        print(output.shape)
+    #     out_dygraph, static_layer = fluid.dygraph.TracedLayer.trace(model, inputs=[img])
+
+    #     out_static_graph = static_layer([img])
+    #     print(len(out_static_graph)) # 1
+    #     print(out_static_graph[0].shape) 
+
+    #     static_layer.save_inference_model(dirname='./saved_infer_model')
+
+    # from calc_flops import *
+    # place = fluid.CPUPlace()
+    # exe = fluid.Executor(place)
+    # program, feed_vars, fetch_vars = fluid.io.load_inference_model('./saved_infer_model', exe)
+
+    # total_flops_params, is_quantize = summary(program)
--- a/demo/once_for_all/dy_models/once_for_all_kernel.py
+++ b/demo/once_for_all/dy_models/once_for_all_kernel.py
+# coding:utf-8
+#   Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle.fluid as fluid
+from paddle.fluid.initializer import MSRA
+from paddle.fluid.param_attr import ParamAttr
+import numpy as np
+
+from dy_models.func_conv import conv2d
+#from func_conv import conv2d
+import pdb
+
+
+def make_divisible(v, divisor=8, min_value=None):
+    """Snap `v` to a multiple of `divisor` without going below `min_value`.
+
+    Args:
+        v: Value to snap.
+        divisor: Step of the grid the value is snapped to.
+        min_value: Smallest allowed result; `divisor` is used when None.
+
+    Returns:
+        The snapped value. One more `divisor` is added when the snapped
+        value is below 90% of `v`.
+    """
+    if min_value is None:
+        min_value = divisor
+    snapped = int(v + divisor / 2) // divisor * divisor
+    candidate = max(min_value, snapped)
+    if candidate >= 0.9 * v:
+        return candidate
+    # Snapping removed more than 10% of the value: step up once.
+    return candidate + divisor
+
+
+class OFA_kernel(fluid.dygraph.Layer):
+    """MobileNetV3-style network whose block kernels can be shrunk at run time.
+
+    Args:
+        scale: Multiplier applied (through `make_divisible`) to the channel
+            counts.
+        model_name: Only 'large' is implemented.
+        token: Sequence of at least 45 entries. Entries 0-19 are kernel
+            sizes, 20-39 are expansion ratios and 40-44 are the number of
+            blocks in each of the five stages. The entry for block
+            `repeat_time` of stage `depth_id` sits at offset
+            `depth_id * 4 + repeat_time` of its group.
+        class_dim: Output size of the final linear layer.
+        ofa_mode: None, or 'kernel' to build the kernel transformation
+            layers in the searched blocks.
+        trainable_besides_trans: `trainable` flag given to the parameters
+            created here and forwarded to the sub-layers.
+
+    Raises:
+        NotImplementedError: If `model_name` is not 'large'.
+    """
+
+    # NOTE: the list defaults below are kept for interface compatibility;
+    # they are only read or rebound, never mutated.
+    def __init__(self,
+                 scale=1.0,
+                 model_name='large',
+                 token=[],
+                 class_dim=1000,
+                 ofa_mode=None,
+                 trainable_besides_trans=False):
+        super(OFA_kernel, self).__init__()
+        # Parsing token
+        assert len(token) >= 45
+        self.kernel_size_lis = tuple(token[:20])
+        self.exp_lis = tuple(token[20:40])
+        self.depth_lis = tuple(token[40:45])
+
+        # Hyperparameter
+        self.scale = scale
+        self.inplanes = 16
+        self.class_dim = class_dim
+        self.ofa_mode = ofa_mode
+        self.trainable_besides_trans = trainable_besides_trans
+        if model_name == "large":
+            # The search space is the last five digits
+            self.cfg_channel = (16, 24, 40, 80, 112, 160)
+            self.cfg_stride = (1, 2, 2, 2, 1, 2)
+            self.cfg_se = (False, False, True, False, True, True)
+            self.cfg_act = ('relu', 'relu', 'relu', 'hard_swish', 'hard_swish',
+                            'hard_swish')
+            self.cls_ch_squeeze = 960
+            self.cls_ch_expand = 1280
+        else:
+            raise NotImplementedError("mode[" + model_name +
+                                      "_model] is not implemented!")
+
+        # conv1
+        self.conv1 = DynamicConvBnLayer(
+            in_c=3,
+            out_c=make_divisible(self.inplanes * self.scale),
+            filter_size=3,
+            stride=2,
+            padding=1,
+            num_groups=1,
+            if_act=True,
+            act='hard_swish',
+            ofa_mode=None,
+            trainable_besides_trans=self.trainable_besides_trans,
+            name='conv1')
+        self.inplanes = make_divisible(self.inplanes * self.scale)
+
+        # conv2: fixed block built from entry 0 of the cfg tuples; its
+        # middle width equals its output width.
+        _num_out_filter = self.cfg_channel[0]
+        num_out_filter = make_divisible(self.scale * _num_out_filter)
+        self.conv2 = ResidualUnit(
+            in_c=self.inplanes,
+            mid_c=num_out_filter,
+            out_c=num_out_filter,
+            filter_size=3,
+            stride=self.cfg_stride[0],
+            use_se=self.cfg_se[0],
+            act=self.cfg_act[0],
+            ofa_mode=None,
+            trainable_besides_trans=self.trainable_besides_trans,
+            name='conv2')
+        self.inplanes = make_divisible(self.cfg_channel[0] * self.scale)
+
+        # conv_blocks: searched blocks, named conv3, conv4, ...
+        i = 3
+        self.conv_blocks = []
+        for depth_id in range(len(self.depth_lis)):
+            for repeat_time in range(self.depth_lis[depth_id]):
+                token_pos = depth_id * 4 + repeat_time
+                # The middle width expands the unscaled output width of the
+                # PREVIOUS block, so compute it before updating that width.
+                num_mid_filter = make_divisible(
+                    self.scale * _num_out_filter * self.exp_lis[token_pos])
+                _num_out_filter = self.cfg_channel[depth_id + 1]
+                num_out_filter = make_divisible(self.scale * _num_out_filter)
+                # Only the first block of a stage uses the stage stride.
+                stride = self.cfg_stride[depth_id +
+                                         1] if repeat_time == 0 else 1
+                block_name = 'conv' + str(i)
+                self.conv_blocks.append(
+                    ResidualUnit(
+                        in_c=self.inplanes,
+                        mid_c=num_mid_filter,
+                        out_c=num_out_filter,
+                        filter_size=self.kernel_size_lis[token_pos],
+                        stride=stride,
+                        use_se=self.cfg_se[depth_id + 1],
+                        act=self.cfg_act[depth_id + 1],
+                        ofa_mode=self.ofa_mode,
+                        trainable_besides_trans=self.trainable_besides_trans,
+                        name=block_name))
+                # A plain list is not tracked by the Layer; register it.
+                self.add_sublayer(
+                    sublayer=self.conv_blocks[-1], name=block_name)
+                self.inplanes = num_out_filter
+                i += 1
+
+        # last_second_conv
+        self.last_second_conv = DynamicConvBnLayer(
+            in_c=self.inplanes,
+            out_c=make_divisible(self.cls_ch_squeeze * self.scale),
+            filter_size=1,
+            stride=1,
+            padding=0,
+            num_groups=1,
+            if_act=True,
+            act='hard_swish',
+            ofa_mode=None,
+            trainable_besides_trans=self.trainable_besides_trans,
+            name='last_second_conv')
+
+        # global_avg_pool
+        self.global_avg_pool = fluid.dygraph.Pool2D(
+            pool_type='avg', global_pooling=True, use_cudnn=False)
+
+        # last_conv
+        self.last_conv = fluid.dygraph.Conv2D(
+            num_channels=make_divisible(self.cls_ch_squeeze * self.scale),
+            num_filters=self.cls_ch_expand,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=None,
+            param_attr=ParamAttr(
+                name='last_conv_weights',
+                trainable=self.trainable_besides_trans),
+            bias_attr=ParamAttr(
+                name='last_conv_bias', trainable=self.trainable_besides_trans))
+
+        # fc
+        self.fc = fluid.dygraph.Linear(
+            input_dim=self.cls_ch_expand,
+            output_dim=self.class_dim,
+            param_attr=ParamAttr(
+                name='fc_weights', trainable=self.trainable_besides_trans),
+            bias_attr=ParamAttr(
+                name='fc_bias', trainable=self.trainable_besides_trans))
+
+    def forward(self,
+                inputs,
+                label=None,
+                dy_token=[],
+                dy_trainable=[],
+                dropout_prob=0.2):
+        """Compute the classifier output and, optionally, accuracies.
+
+        Args:
+            inputs: Input passed to the stem conv.
+            label: Optional labels; when given, top-1 and top-5 accuracy are
+                computed against the classifier output.
+            dy_token: Kernel size to run each searched block with, one entry
+                per block. Only read when `ofa_mode == 'kernel'`.
+            dy_trainable: Per-block flag forwarded as `dy_trans_trainable`.
+                Only read when `ofa_mode == 'kernel'`.
+            dropout_prob: Probability handed to the dropout applied before
+                the final linear layer.
+
+        Returns:
+            The linear layer output, or the tuple `(output, acc1, acc5)`
+            when `label` is not None.
+        """
+        x = self.conv1(inputs)
+        x = self.conv2(x)
+        num_blocks = sum(self.depth_lis)
+        if self.ofa_mode == 'kernel':
+            assert len(dy_token) >= num_blocks
+            assert len(dy_trainable) >= num_blocks
+            # NOTE: according to the original paper only one filter should
+            # be transformed in each iteration; this is not enforced here.
+        else:
+            # Outside kernel mode every block runs with its built-in kernel.
+            dy_token = [None] * num_blocks
+            dy_trainable = [True] * num_blocks
+        for block, block_size, block_trainable in zip(
+                self.conv_blocks, dy_token, dy_trainable):
+            x = block(x, block_size, block_trainable)
+        x = self.last_second_conv(x)
+        x = self.global_avg_pool(x)
+        x = self.last_conv(x)
+        x = fluid.layers.hard_swish(x)
+        x = fluid.layers.dropout(x=x, dropout_prob=dropout_prob)
+        # Squeeze only the two pooled spatial axes; an empty axes list would
+        # also remove the batch axis for a batch of one.
+        x = fluid.layers.squeeze(x, axes=[2, 3])
+        x = self.fc(x)
+
+        # Test for presence explicitly: the truthiness of a label tensor
+        # depends on its contents and is not defined for a whole batch.
+        if label is not None:
+            acc1 = fluid.layers.accuracy(input=x, label=label)
+            acc5 = fluid.layers.accuracy(input=x, label=label, k=5)
+            return x, acc1, acc5
+        return x
+
+
+class DynamicConvBnLayer(fluid.dygraph.Layer):
+    """Conv2D + BatchNorm (+ activation) with an optionally shrinkable kernel.
+
+    With `ofa_mode == 'kernel'` the layer can be run with a smaller filter
+    size than it was built with: the centre of the stored kernel is cropped
+    and passed through a learned linear transformation, and a BatchNorm
+    dedicated to the smaller size is applied.
+
+    Args:
+        in_c: Input channel count of the conv.
+        out_c: Output channel count of the conv and channel count of the BNs.
+        filter_size: Filter size the conv is built with.
+        stride: Conv stride.
+        padding: Conv padding for the built-in filter size.
+        num_groups: Conv groups; must not be 1 in kernel mode.
+        if_act: Whether `forward` applies an activation.
+        act: Activation name: 'relu', 'hard_swish' or 'mish'.
+        use_cudnn: Forwarded to the conv calls.
+        ofa_mode: None (plain conv + BN) or 'kernel'.
+        trainable_besides_trans: `trainable` flag of the conv weight and of
+            the main BN scale and offset.
+        name: Prefix for parameter and sub-layer names.
+
+    Raises:
+        RuntimeError: In kernel mode when `num_groups == 1`.
+        NotImplementedError: For any other truthy `ofa_mode`.
+    """
+
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 filter_size,
+                 stride,
+                 padding,
+                 num_groups=1,
+                 if_act=True,
+                 act=None,
+                 use_cudnn=True,
+                 ofa_mode=None,
+                 trainable_besides_trans=True,
+                 name=''):
+        super(DynamicConvBnLayer, self).__init__()
+        self.if_act = if_act
+        self.act = act
+        self.filter_size = filter_size
+        self.stride = stride
+        self.num_groups = num_groups
+        self.ofa_mode = ofa_mode
+        self.trainable_besides_trans = trainable_besides_trans
+        self.use_cudnn = use_cudnn
+        self.name = name
+
+        self.conv = fluid.dygraph.Conv2D(
+            num_channels=in_c,
+            num_filters=out_c,
+            filter_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            param_attr=ParamAttr(
+                trainable=self.trainable_besides_trans, name=name + "_weights"),
+            bias_attr=False,
+            use_cudnn=use_cudnn,
+            act=None)
+        # Main BN: created with use_global_stats=True; its scale and offset
+        # get learning rate 0 when they are not trainable.
+        self.bn = fluid.dygraph.BatchNorm(
+            num_channels=out_c,
+            act=None,
+            param_attr=ParamAttr(
+                name=name + "_bn" + "_scale",
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=0.0),
+                trainable=self.trainable_besides_trans,
+                learning_rate=1.0 if self.trainable_besides_trans else 0.0),
+            bias_attr=ParamAttr(
+                name=name + "_bn" + "_offset",
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=0.0),
+                trainable=self.trainable_besides_trans,
+                learning_rate=1.0 if self.trainable_besides_trans else 0.0),
+            moving_mean_name=name + "_bn" + '_mean',
+            moving_variance_name=name + "_bn" + '_variance',
+            use_global_stats=True)
+
+        if self.ofa_mode == 'kernel':
+            if num_groups == 1:
+                raise RuntimeError(
+                    'OFA only supports depthwise convolution for kernel transformation operations.'
+                )
+            # Transformations ordered from the largest kernel to the
+            # smallest, i.e. [7->5, 5->3] for a 7x7 layer.
+            self.trans_block = []
+            if filter_size >= 5:
+                self.trans_block.append(
+                    self._identity_linear(9, name + "_transLinear_5to3"))
+                self.add_sublayer(
+                    sublayer=self.trans_block[-1],
+                    name=name + "_transLinear_5to3")
+                self.bn_3x3 = self._branch_bn(out_c, name + "_bn_3x3")
+            if filter_size >= 7:
+                self.trans_block.insert(
+                    0, self._identity_linear(25, name + "_transLinear_7to5"))
+                self.add_sublayer(
+                    sublayer=self.trans_block[0],
+                    name=name + "_transLinear_7to5")
+                self.bn_5x5 = self._branch_bn(out_c, name + "_bn_5x5")
+        elif self.ofa_mode:
+            raise NotImplementedError("OFA_mode [" + ofa_mode +
+                                      "] is not implemented!")
+
+    @staticmethod
+    def _identity_linear(size, param_name):
+        """Build a bias-free size x size Linear initialised to the identity."""
+        return fluid.dygraph.Linear(
+            input_dim=size,
+            output_dim=size,
+            param_attr=ParamAttr(
+                initializer=fluid.initializer.NumpyArrayInitializer(
+                    np.eye(size)),
+                regularizer=fluid.regularizer.L2Decay(0.0),
+                name=param_name),
+            bias_attr=False)
+
+    @staticmethod
+    def _branch_bn(out_c, prefix):
+        """Build the BatchNorm used for one shrunk kernel size."""
+        return fluid.dygraph.BatchNorm(
+            num_channels=out_c,
+            act=None,
+            param_attr=ParamAttr(
+                name=prefix + "_scale",
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=0.0)),
+            bias_attr=ParamAttr(
+                name=prefix + "_offset",
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=0.0)),
+            moving_mean_name=prefix + '_mean',
+            moving_variance_name=prefix + '_variance')
+
+    def _shrunk_kernel(self, dy_filter_size):
+        """Return the conv kernel cropped and transformed to `dy_filter_size`."""
+        # Work on a copy of the weight values rather than on the parameter.
+        # NOTE(review): presumably so that gradients reach only the
+        # transformation matrices - confirm.
+        kernel_weight_np = self.conv.weight.numpy().copy()
+        kernel_weight = fluid.dygraph.to_variable(
+            kernel_weight_np, zero_copy=False)
+        kernel_weight.stop_gradient = False
+        _batch_size, _channel = kernel_weight.shape[0], kernel_weight.shape[1]
+        i = 0
+        while dy_filter_size < kernel_weight.shape[-1]:
+            # Drop the outer ring of the kernel, then run every flattened
+            # k x k kernel through the matching transformation.
+            kernel_weight = kernel_weight[:, :, 1:-1, 1:-1]
+            kernel_weight = fluid.layers.reshape(
+                kernel_weight, [_batch_size, _channel, -1])
+            kernel_weight = fluid.layers.reshape(
+                kernel_weight, [-1, kernel_weight.shape[-1]])
+            kernel_weight = self.trans_block[i](kernel_weight)
+            kernel_weight = fluid.layers.reshape(
+                kernel_weight, [_batch_size, _channel, -1])
+            _new_filter_size = int(kernel_weight.shape[-1]**0.5)
+            kernel_weight = fluid.layers.reshape(kernel_weight, [
+                _batch_size, _channel, _new_filter_size, _new_filter_size
+            ])
+            i += 1
+        return kernel_weight
+
+    def _activate(self, x):
+        """Apply the configured activation, or return `x` if `if_act` is off."""
+        if not self.if_act:
+            return x
+        if self.act == 'relu':
+            return fluid.layers.relu(x)
+        if self.act == 'hard_swish':
+            return fluid.layers.hard_swish(x)
+        if self.act == 'mish':
+            return x * fluid.layers.tanh(fluid.layers.softplus(x))
+        # Raise instead of print + exit(): exit() would end the whole
+        # process with a success status and cannot be handled by the caller.
+        raise ValueError(
+            'The activation function is selected incorrectly: %r '
+            "(expected 'relu', 'hard_swish' or 'mish')." % (self.act, ))
+
+    def forward(self, x, dy_filter_size=None, dy_trans_trainable=True):
+        """Run the layer, optionally with a smaller filter size.
+
+        Args:
+            x: Input passed to the conv.
+            dy_filter_size: In kernel mode, the filter size to run with
+                (3, 5, 7 or None for the built-in size). Ignored otherwise.
+            dy_trans_trainable: In kernel mode, written to the `trainable`
+                flag of every transformation weight. Ignored otherwise.
+
+        Raises:
+            RuntimeError: If `dy_filter_size` is larger than the built-in
+                filter size, or no BatchNorm exists for it.
+            ValueError: If `if_act` is set and `act` is not supported.
+        """
+        if self.ofa_mode == 'kernel':
+            for trans in self.trans_block:
+                trans.weight.trainable = dy_trans_trainable
+            assert dy_filter_size in [3, 5, 7, None]
+            if dy_filter_size is None or dy_filter_size == self.filter_size:
+                x = self.conv(x)
+                # The full-size kernel goes through the main BN, exactly as
+                # in the non-OFA path. Previously this call sat in the
+                # shrink branch where its condition could never hold, so
+                # the full kernel skipped normalisation.
+                x = self.bn(x)
+            elif dy_filter_size > self.filter_size:
+                raise RuntimeError(
+                    'The new filter size should be less than or equal to the size of the original model.'
+                )
+            else:
+                kernel_weight = self._shrunk_kernel(dy_filter_size)
+                x = conv2d(
+                    x,
+                    weight=kernel_weight,
+                    bias=None,
+                    padding=int((dy_filter_size - 1) // 2),
+                    stride=self.stride,
+                    dilation=1,
+                    groups=self.num_groups,
+                    use_cudnn=self.use_cudnn,
+                    act=None,
+                    data_format='NCHW',
+                    name=self.name + '_transConv')
+
+                # Each shrunk size has its own BN.
+                if dy_filter_size == 5:
+                    x = self.bn_5x5(x)
+                elif dy_filter_size == 3:
+                    x = self.bn_3x3(x)
+                else:
+                    raise RuntimeError(
+                        'No BatchNorm is available for filter size %r.' %
+                        (dy_filter_size, ))
+        else:
+            x = self.conv(x)
+            x = self.bn(x)
+        return self._activate(x)
+
+
+class ResidualUnit(fluid.dygraph.Layer):
+    """Expand conv, depthwise conv, optional SE module and linear conv.
+
+    Only the depthwise conv receives `ofa_mode`; the two 1x1 convs are
+    always built with `ofa_mode=None`.
+
+    Args:
+        in_c: Input channel count.
+        mid_c: Channel count between the expand and linear convs.
+        out_c: Output channel count.
+        filter_size: Filter size the depthwise conv is built with.
+        stride: Stride of the depthwise conv.
+        use_se: Whether an SEModule is inserted after the depthwise conv.
+        act: Activation name used by the expand and depthwise convs.
+        ofa_mode: Forwarded to the depthwise conv.
+        trainable_besides_trans: Forwarded to every sub-layer.
+        name: Prefix for the names of the sub-layers' parameters.
+    """
+
+    def __init__(self,
+                 in_c,
+                 mid_c,
+                 out_c,
+                 filter_size,
+                 stride,
+                 use_se,
+                 act=None,
+                 ofa_mode=None,
+                 trainable_besides_trans=True,
+                 name=''):
+        super(ResidualUnit, self).__init__()
+        # The identity shortcut is used only when input and output match.
+        self.if_shortcut = (stride == 1) and (in_c == out_c)
+        self.if_se = use_se
+
+        # Settings shared by the two 1x1 convs.
+        pointwise = dict(
+            filter_size=1,
+            stride=1,
+            padding=0,
+            ofa_mode=None,
+            trainable_besides_trans=trainable_besides_trans)
+        depthwise_padding = int((filter_size - 1) // 2)
+
+        self.expand_conv = DynamicConvBnLayer(
+            in_c=in_c,
+            out_c=mid_c,
+            if_act=True,
+            act=act,
+            name=name + '_expand',
+            **pointwise)
+        # groups == channels: one filter per channel.
+        self.bottleneck_conv = DynamicConvBnLayer(
+            in_c=mid_c,
+            out_c=mid_c,
+            filter_size=filter_size,
+            stride=stride,
+            padding=depthwise_padding,
+            num_groups=mid_c,
+            if_act=True,
+            act=act,
+            use_cudnn=False,
+            ofa_mode=ofa_mode,
+            trainable_besides_trans=trainable_besides_trans,
+            name=name + '_depthwise')
+        if self.if_se:
+            self.mid_se = SEModule(
+                mid_c,
+                trainable_besides_trans=trainable_besides_trans,
+                name=name + '_SE')
+        self.linear_conv = DynamicConvBnLayer(
+            in_c=mid_c,
+            out_c=out_c,
+            if_act=False,
+            act=None,
+            name=name + '_linear',
+            **pointwise)
+
+    def forward(self, inputs, dy_filter_size=None, dy_trans_trainable=True):
+        """Run the unit; the two dynamic arguments go to the depthwise conv.
+
+        Returns:
+            The linear conv output, plus `inputs` when the shortcut is on.
+        """
+        out = self.expand_conv(inputs)
+        out = self.bottleneck_conv(out, dy_filter_size, dy_trans_trainable)
+        if self.if_se:
+            out = self.mid_se(out)
+        out = self.linear_conv(out)
+        if not self.if_shortcut:
+            return out
+        return fluid.layers.elementwise_add(inputs, out)
+
+
+class SEModule(fluid.dygraph.Layer):
+    def __init__(self,
+                 channel,
+                 reduction=4,
+                 trainable_besides_trans=True,
+                 name=''):
+        super(SEModule, self).__init__()
+        self.avg_pool = fluid.dygraph.Pool2D(
+            pool_type='avg', global_pooling=True, use_cudnn=False)
+        self.conv1 = fluid.dygraph.Conv2D(
+            num_channels=channel,
+            num_filters=channel // reduction,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            act='relu',
+            param_attr=ParamAttr(
+                name=name + "_weights1", trainable=trainable_besides_trans),
+            bias_attr=ParamAttr(
+                name=name + "_bias1", trainable=trainable_besides_trans))
+        self.conv2 = fluid.dygraph.Conv2D(
+            num_channels=channel // reduction,
+            num_filters=channel,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            act=None,
+            param_attr=ParamAttr(
+                name=name + "_weights2", trainable=trainable_besides_trans),
+            bias_attr=ParamAttr(
+                name=name + "_bias2", trainable=trainable_besides_trans))
+
+    def forward(self, inputs):
+        outputs = self.avg_pool(inputs)
+        outputs = self.conv1(outputs)
+        outputs = self.conv2(outputs)
+        outputs = fluid.layers.hard_sigmoid(outputs, slope=0.2, offset=0.5)
+        return fluid.layers.elementwise_mul(x=inputs, y=outputs, axis=0)
+
+
+if __name__ == "__main__":
+    # from calc_flops import summary
+    import numpy as np
+    place = fluid.CPUPlace()
+    with fluid.dygraph.guard(place):
+        model = OFA_kernel(
+            scale=1.0,
+            model_name='large',
+            token=[
+                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3,
+                3, 3, 3, 3, 4, 3, 4, 3, 6, 4, 4, 4, 4, 6, 3, 6, 4, 6, 3, 2, 3,
+                3, 3, 4
+            ],
+            class_dim=1000,
+            ofa_mode='kernel',
+            trainable_besides_trans=False)
+
+        img = np.random.uniform(0, 255, [8, 3, 224, 224]).astype('float32')
+        img = fluid.dygraph.to_variable(img)
+        token = [7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 5, 7, 5, 7]
+        trainable = [
+            False, False, False, False, False, False, False, False, False,
+            False, False, True, False, False, False
+        ]
+        res = model(img, dy_token=token, dy_trainable=trainable)
+        print(res.shape)
--- a/demo/once_for_all/reader.py
+++ b/demo/once_for_all/reader.py
+#   Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import os
+import math
+import random
+import functools
+import numpy as np
+import cv2
+
+import paddle
+from paddle import fluid
+from utils.autoaugment import ImageNetPolicy
+from PIL import Image
+
+policy = None
+
+random.seed(0)
+np.random.seed(0)
+
+
+def rotate_image(img):
+    """rotate image
+    Args:
+        img: image data
+    Returns:
+        rotated image data
+    """
+    (h, w) = img.shape[:2]
+    center = (w / 2, h / 2)
+    angle = np.random.randint(-10, 11)
+    M = cv2.getRotationMatrix2D(center, angle, 1.0)
+    rotated = cv2.warpAffine(img, M, (w, h))
+    return rotated
+
+
+def random_crop(img, size, settings, scale=None, ratio=None,
+                interpolation=None):
+    """random crop image
+        
+    Args:
+        img: image data
+        size: crop size
+        settings: arguments
+        scale: scale parameter
+        ratio: ratio parameter
+    Returns:
+        random cropped image data
+    """
+    lower_scale = settings.lower_scale
+    lower_ratio = settings.lower_ratio
+    upper_ratio = settings.upper_ratio
+    scale = [lower_scale, 1.0] if scale is None else scale
+    ratio = [lower_ratio, upper_ratio] if ratio is None else ratio
+
+    aspect_ratio = math.sqrt(np.random.uniform(*ratio))
+    w = 1. * aspect_ratio
+    h = 1. / aspect_ratio
+
+    bound = min((float(img.shape[0]) / img.shape[1]) / (h**2),
+                (float(img.shape[1]) / img.shape[0]) / (w**2))
+
+    scale_max = min(scale[1], bound)
+    scale_min = min(scale[0], bound)
+
+    target_area = img.shape[0] * img.shape[1] * np.random.uniform(scale_min,
+                                                                  scale_max)
+    target_size = math.sqrt(target_area)
+    w = int(target_size * w)
+    h = int(target_size * h)
+
+    i = np.random.randint(0, img.shape[0] - h + 1)
+    j = np.random.randint(0, img.shape[1] - w + 1)
+    img = img[i:i + h, j:j + w, :]
+
+    if interpolation:
+        resized = cv2.resize(img, (size, size), interpolation=interpolation)
+    else:
+        resized = cv2.resize(img, (size, size))
+    return resized
+
+
+#NOTE:(2019/08/08) distort color func is not implemented
+def distort_color(img):
+    """distort image color
+    Args:
+        img: image data
+    Returns:
+        distorted color image data
+    """
+    return img
+
+
+def resize_short(img, target_size, interpolation=None):
+    """resize image
+    
+    Args:
+        img: image data
+        target_size: resize short target size
+        interpolation: interpolation mode
+    Returns:
+        resized image data
+    """
+    percent = float(target_size) / min(img.shape[0], img.shape[1])
+    resized_width = int(round(img.shape[1] * percent))
+    resized_height = int(round(img.shape[0] * percent))
+    if interpolation:
+        resized = cv2.resize(
+            img, (resized_width, resized_height), interpolation=interpolation)
+    else:
+        resized = cv2.resize(img, (resized_width, resized_height))
+    return resized
+
+
+def crop_image(img, target_size, center):
+    """crop image 
+    
+    Args:
+        img: images data
+        target_size: crop target size
+        center: crop mode
+    
+    Returns:
+        img: cropped image data
+    """
+    height, width = img.shape[:2]
+    size = target_size
+    if center == True:
+        w_start = (width - size) // 2
+        h_start = (height - size) // 2
+    else:
+        w_start = np.random.randint(0, width - size + 1)
+        h_start = np.random.randint(0, height - size + 1)
+    w_end = w_start + size
+    h_end = h_start + size
+    img = img[h_start:h_end, w_start:w_end, :]
+    return img
+
+
+def create_mixup_reader(settings, rd):
+    """
+    """
+
+    class context:
+        tmp_mix = []
+        tmp_l1 = []
+        tmp_l2 = []
+        tmp_lam = []
+
+    alpha = settings.mixup_alpha
+
+    def fetch_data():
+        for item in rd():
+            yield item
+
+    def mixup_data():
+        for data_list in fetch_data():
+            if alpha > 0.:
+                lam = np.random.beta(alpha, alpha)
+            else:
+                lam = 1.
+            l1 = np.array(data_list)
+            l2 = np.random.permutation(l1)
+            mixed_l = [
+                l1[i][0] * lam + (1 - lam) * l2[i][0] for i in range(len(l1))
+            ]
+            yield (mixed_l, l1, l2, lam)
+
+    def mixup_reader():
+        for context.tmp_mix, context.tmp_l1, context.tmp_l2, context.tmp_lam in mixup_data(
+        ):
+            for i in range(len(context.tmp_mix)):
+                mixed_l = context.tmp_mix[i]
+                l1 = context.tmp_l1[i]
+                l2 = context.tmp_l2[i]
+                lam = context.tmp_lam
+                yield (mixed_l, int(l1[1]), int(l2[1]), float(lam))
+
+    return mixup_reader
+
+
+def process_image(sample, settings, mode, color_jitter, rotate):
+    """ process_image """
+
+    mean = settings.image_mean
+    std = settings.image_std
+    crop_size = settings.crop_size
+
+    img_path = sample[0]
+    img = cv2.imread(img_path)
+
+    if mode == 'train':
+        if rotate:
+            img = rotate_image(img)
+        if crop_size > 0:
+            img = random_crop(
+                img, crop_size, settings, interpolation=settings.interpolation)
+        if color_jitter:
+            img = distort_color(img)
+        if np.random.randint(0, 2) == 1:
+            img = img[:, ::-1, :]
+    else:
+        if crop_size > 0:
+            target_size = settings.resize_short_size
+            img = resize_short(
+                img, target_size, interpolation=settings.interpolation)
+            img = crop_image(img, target_size=crop_size, center=True)
+
+    img = img[:, :, ::-1]
+
+    if 'use_aa' in settings and settings.use_aa and mode == 'train':
+        img = np.ascontiguousarray(img)
+        img = Image.fromarray(img)
+        img = policy(img)
+        img = np.asarray(img)
+
+    img = img.astype('float32').transpose((2, 0, 1)) / 255
+    img_mean = np.array(mean).reshape((3, 1, 1))
+    img_std = np.array(std).reshape((3, 1, 1))
+    img -= img_mean
+    img /= img_std
+
+    if mode == 'train' or mode == 'val':
+        return (img, sample[1])
+    elif mode == 'test':
+        return (img, )
+
+
+def process_batch_data(input_data, settings, mode, color_jitter, rotate):
+    batch_data = []
+    for sample in input_data:
+        if os.path.isfile(sample[0]):
+            batch_data.append(
+                process_image(sample, settings, mode, color_jitter, rotate))
+        else:
+            print("File not exist : %s" % sample[0])
+    return batch_data
+
+
+class ImageNetReader:
+    def __init__(self, seed=None, place_num=1):
+        self.shuffle_seed = seed
+        self.place_num = place_num
+
+    def set_shuffle_seed(self, seed):
+        assert isinstance(seed, int), "shuffle seed must be int"
+        self.shuffle_seed = seed
+
+    def _reader_creator(self,
+                        settings,
+                        file_list,
+                        mode,
+                        shuffle=False,
+                        color_jitter=False,
+                        rotate=False,
+                        data_dir=None):
+        num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
+        if mode == 'test':
+            batch_size = 1
+        else:
+            batch_size = settings.batch_size / self.place_num
+
+        def reader():
+            def read_file_list():
+                with open(file_list) as flist:
+                    full_lines = [line.strip() for line in flist]
+                    if mode != "test" and len(full_lines) < settings.batch_size:
+                        print(
+                            "Warning: The number of the whole data ({}) is smaller than the batch_size ({}), and drop_last is turnning on, so nothing  will feed in program, Terminated now. Please reset batch_size to a smaller number or feed more data!"
+                            .format(len(full_lines), settings.batch_size))
+                        os._exit(1)
+                    if num_trainers > 1 and mode == "train":
+                        assert self.shuffle_seed is not None, "multiprocess train, shuffle seed must be set!"
+                        np.random.RandomState(self.shuffle_seed).shuffle(
+                            full_lines)
+                    elif shuffle:
+                        assert self.shuffle_seed is not None, "multiprocess train, shuffle seed must be set!"
+                        np.random.RandomState(self.shuffle_seed).shuffle(
+                            full_lines)
+
+                batch_data = []
+                for line in full_lines:
+                    img_path, label = line.split()
+                    img_path = os.path.join(data_dir, img_path)
+                    batch_data.append([img_path, int(label)])
+                    if len(batch_data) == batch_size:
+                        if mode == 'train' or mode == 'val' or mode == 'test':
+                            yield batch_data
+
+                        batch_data = []
+
+            return read_file_list
+
+        data_reader = reader()
+        if mode == 'train' and num_trainers > 1:
+            assert self.shuffle_seed is not None, \
+                "If num_trainers > 1, the shuffle_seed must be set, because " \
+                "the order of batch data generated by reader " \
+                "must be the same in the respective processes."
+            data_reader = paddle.fluid.contrib.reader.distributed_batch_reader(
+                data_reader)
+
+        mapper = functools.partial(
+            process_batch_data,
+            settings=settings,
+            mode=mode,
+            color_jitter=color_jitter,
+            rotate=rotate)
+
+        ret = fluid.io.xmap_readers(
+            mapper,
+            data_reader,
+            settings.reader_thread,
+            settings.reader_buf_size,
+            order=False)
+
+        return ret
+
+    def train(self, settings):
+        """Create a reader for trainning
+        Args:
+            settings: arguments
+        Returns:
+            train reader
+        """
+        file_list = os.path.join(settings.data_dir, 'train_list.txt')
+        assert os.path.isfile(
+            file_list), "{} doesn't exist, please check data list path".format(
+                file_list)
+
+        if 'use_aa' in settings and settings.use_aa:
+            global policy
+            policy = ImageNetPolicy()
+
+        reader = self._reader_creator(
+            settings,
+            file_list,
+            'train',
+            shuffle=True,
+            color_jitter=False,
+            rotate=False,
+            data_dir=settings.data_dir)
+
+        if settings.use_mixup == True:
+            reader = create_mixup_reader(settings, reader)
+            reader = fluid.io.batch(
+                reader,
+                batch_size=int(settings.batch_size / self.place_num),
+                drop_last=True)
+        return reader
+
+    def val(self, settings):
+        """Create a reader for eval
+        Args:
+            settings: arguments
+        Returns:
+            eval reader
+        """
+        file_list = os.path.join(settings.data_dir, 'val_list.txt')
+
+        assert os.path.isfile(
+            file_list), "{} doesn't exist, please check data list path".format(
+                file_list)
+
+        return self._reader_creator(
+            settings,
+            file_list,
+            'val',
+            shuffle=False,
+            data_dir=settings.data_dir)
+
+    def test(self, settings):
+        """Create a reader for testing
+        Args:
+            settings: arguments
+        Returns:
+            test reader
+        """
+        file_list = os.path.join(settings.data_dir, 'val_list.txt')
+
+        assert os.path.isfile(
+            file_list), "{} doesn't exist, please check data list path".format(
+                file_list)
+        return self._reader_creator(
+            settings,
+            file_list,
+            'test',
+            shuffle=False,
+            data_dir=settings.data_dir)
--- a/demo/once_for_all/start.sh
+++ b/demo/once_for_all/start.sh
+export PYTHONPATH=$PWD:$PYTHONPATH
+
+python -m paddle.distributed.launch --log_dir my_log train.py \
+    --use_data_parallel=True \
+    --batch_size=2048 \
+    --lr=1e-3 \
+    --l2_decay=3e-5 \
+    --total_images=1281167 \
+    --class_dim=1000 \
+    --image_shape=3,224,224 \
+    --model_save_dir=output.ofa.kernel \
+    --lr_strategy=piecewise_decay \
+    --num_epochs=360 \
+    --data_dir=./data/ILSVRC2012 \
+    --model=once_for_all_kernel \
+    --use_aa=False \
+    --checkpoint=./_ofa_epoch359
--- a/demo/once_for_all/train.py
+++ b/demo/once_for_all/train.py
+#   Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import time
+import sys
+import math
+import argparse
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.initializer import MSRA
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.dygraph.base import to_variable
+from paddle.fluid import framework
+import reader
+from utils import *
+from dy_models import MobileNetV1, MobileNetV2, MobileNetV3_dy, OFA_kernel
+
+args = parse_args()
+if int(os.getenv("PADDLE_TRAINER_ID", 0)) == 0:
+    print_arguments(args)
+
+
+def dy_token_gene(ori_token, alter=[3, 5, 7], num_of_changes=1):
+    kernel_token = tuple(ori_token[:20])
+    exp_token = tuple(ori_token[20:40])
+    depth_token = tuple(ori_token[40:45])
+    dy_token = []
+    for depth_id in range(len(depth_token)):
+        for repeat_time in range(depth_token[depth_id]):
+            dy_token.append(kernel_token[depth_id * 4 + repeat_time])
+    idx_lis = np.random.choice(len(dy_token), num_of_changes, replace=False)
+    for idx in idx_lis:
+        while True:
+            tmp_kernel = alter[np.random.choice(len(alter))]
+            if tmp_kernel != dy_token[idx]:
+                dy_token[idx] = tmp_kernel
+                break
+    return dy_token
+
+
+def dy_token_gene_v2(ori_token, alter=[3, 5, 7], position_from_end=-1):
+    kernel_token = tuple(ori_token[:20])
+    exp_token = tuple(ori_token[20:40])
+    depth_token = tuple(ori_token[40:45])
+    dy_token = []
+    dy_trainable = []
+    for depth_id in range(len(depth_token)):
+        for repeat_time in range(depth_token[depth_id]):
+            dy_token.append(kernel_token[depth_id * 4 + repeat_time])
+            dy_trainable.append(False)
+
+    dy_token[position_from_end] = alter[0]
+    dy_trainable[position_from_end] = True
+
+    for i in range(abs(position_from_end) - 1):
+        dy_token[position_from_end + i + 1] = alter[np.random.choice(
+            len(alter))]
+    return dy_token, dy_trainable
+
+
+def eval(net, test_data_loader, eop, position_from_end=1):
+    total_loss = 0.0
+    total_acc1 = 0.0
+    total_acc5 = 0.0
+    total_sample = 0
+    t_last = 0
+    token = [
+        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3,
+        3, 4, 3, 4, 3, 6, 4, 4, 4, 4, 6, 3, 6, 4, 6, 3, 2, 3, 3, 3, 4
+    ]
+    place_num = paddle.fluid.core.get_cuda_device_count(
+    ) if args.use_gpu else int(os.environ.get('CPU_NUM', 1))
+    for i, (img, label) in enumerate(test_data_loader()):
+        t1 = time.time()
+        label = to_variable(label.numpy().astype('int64').reshape(
+            int(args.batch_size // place_num), 1))
+        if args.model == "once_for_all_kernel":
+            dy_token, dy_trainable = dy_token_gene_v2(
+                token, alter=[5, 7], position_from_end=position_from_end)
+            print(dy_token)
+            out = net(img, dy_token=dy_token, dy_trainable=dy_trainable)
+        else:
+            out = net(img)
+        softmax_out = fluid.layers.softmax(out, use_cudnn=False)
+        loss = fluid.layers.cross_entropy(input=softmax_out, label=label)
+        avg_loss = fluid.layers.mean(x=loss)
+        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+        t2 = time.time()
+        print( "test | epoch id: %d, batch id: %d, avg_loss %0.5f acc_top1 %0.5f acc_top5 %0.5f %2.4f sec read_t:%2.4f" % \
+                (eop, i, avg_loss.numpy(), acc_top1.numpy(), acc_top5.numpy(), t2 - t1 , t1 - t_last))
+        sys.stdout.flush()
+        total_loss += avg_loss.numpy()
+        total_acc1 += acc_top1.numpy()
+        total_acc5 += acc_top5.numpy()
+        total_sample += 1
+        t_last = time.time()
+    print("final eval loss %0.3f acc1 %0.3f acc5 %0.3f" % \
+          (total_loss / total_sample, \
+           total_acc1 / total_sample, total_acc5 / total_sample))
+    sys.stdout.flush()
+
+
+def train_mobilenet():
+    """Build the model selected by ``args.model`` and run the training loop.
+
+    Steps: choose the device, build the network and optimizer, optionally
+    load a checkpoint, create the data loaders, then for every epoch train
+    on all batches, save a checkpoint and evaluate. The final parameters are
+    saved once more after the last epoch. All options come from the
+    module-level ``args``.
+    """
+    if not args.use_gpu:
+        place = fluid.CPUPlace()
+    elif not args.use_data_parallel:
+        place = fluid.CUDAPlace(0)
+    else:
+        place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)
+    with fluid.dygraph.guard(place):
+        # 1. init net and optimizer
+        place_num = paddle.fluid.core.get_cuda_device_count(
+        ) if args.use_gpu else int(os.environ.get('CPU_NUM', 1))
+        if args.ce:
+            # Fix the random seeds in "ce" mode.
+            print("ce mode")
+            seed = 33
+            np.random.seed(seed)
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+        if args.use_data_parallel:
+            strategy = fluid.dygraph.parallel.prepare_context()
+
+        if args.model == "MobileNetV1":
+            net = MobileNetV1(class_dim=args.class_dim, scale=1.0)
+            model_path_pre = 'mobilenet_v1'
+        elif args.model == "MobileNetV2":
+            net = MobileNetV2(class_dim=args.class_dim, scale=1.0)
+            model_path_pre = 'mobilenet_v2'
+        elif args.model == "MobileNetV3":
+            net = MobileNetV3_dy(
+                class_dim=args.class_dim, scale=1.0, model_name='large')
+            model_path_pre = 'mobilenet_v3'
+        elif args.model == "once_for_all":
+            token = [
+                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3,
+                3, 3, 3, 3, 4, 3, 4, 3, 6, 4, 4, 4, 4, 6, 3, 6, 4, 6, 3, 2, 3,
+                3, 3, 4
+            ]
+            net = OFA_kernel(
+                class_dim=args.class_dim,
+                scale=1.0,
+                model_name='large',
+                token=token,
+                ofa_mode=False)
+            model_path_pre = 'ofa'
+        elif args.model == "once_for_all_kernel":
+            token = [
+                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3,
+                3, 3, 3, 3, 4, 3, 4, 3, 6, 4, 4, 4, 4, 6, 3, 6, 4, 6, 3, 2, 3,
+                3, 3, 4
+            ]
+            net = OFA_kernel(
+                class_dim=args.class_dim,
+                scale=1.0,
+                model_name='large',
+                token=token,
+                ofa_mode='kernel',
+                trainable_besides_trans=False)
+            model_path_pre = 'ofa_kernel'
+        else:
+            # NOTE(review): the message names only two of the five model
+            # names accepted above.
+            print(
+                "wrong model name, please try model = MobileNetV1 or MobileNetV2"
+            )
+            exit()
+
+        optimizer = create_optimizer(args=args, parameter_list=net.parameters())
+        if args.use_data_parallel:
+            net = fluid.dygraph.parallel.DataParallel(net, strategy)
+
+        # 2. load checkpoint
+        if args.checkpoint:
+            assert os.path.exists(args.checkpoint + ".pdparams"), \
+                "Given dir {}.pdparams not exist.".format(args.checkpoint)
+            assert os.path.exists(args.checkpoint + ".pdopt"), \
+                "Given dir {}.pdopt not exist.".format(args.checkpoint)
+            para_dict, opti_dict = fluid.dygraph.load_dygraph(args.checkpoint)
+
+            if args.model == "once_for_all_kernel":
+                # Copy only the parameters whose names also exist in the
+                # checkpoint; the others keep their initial values.
+                # NOTE(review): opti_dict is not applied in this branch, so
+                # the optimizer state is not restored - confirm intended.
+                inner_state_dict = net.state_dict()
+                for name, para in inner_state_dict.items():
+                    key_name = name
+                    if key_name in para_dict:
+                        print(key_name)
+                        para.set_value(para_dict[key_name])
+            else:
+                net.set_dict(para_dict)
+                optimizer.set_dict(opti_dict)
+
+        # 3. reader
+        train_data_loader, train_data = utility.create_data_loader(
+            is_train=True, args=args)
+        test_data_loader, test_data = utility.create_data_loader(
+            is_train=False, args=args)
+        num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
+        imagenet_reader = reader.ImageNetReader(seed=0, place_num=place_num)
+        train_reader = imagenet_reader.train(settings=args)
+        test_reader = imagenet_reader.val(settings=args)
+        train_data_loader.set_sample_list_generator(train_reader, place)
+        test_data_loader.set_sample_list_generator(test_reader, place)
+        # NOTE(review): these lines are replaced at epoch 0 by the contents of
+        # ./token/1.txt (see the `eop % total_epoch_each_layer` block below),
+        # so this read only matters as a check that the file exists.
+        with open('./dy_token.txt', 'r+') as f:
+            lines = f.readlines()
+
+        # Hard-coded switch: set to True to run one evaluation and exit.
+        eval_sign = False
+        if eval_sign:
+            net.eval()
+            eval(net, test_data_loader, 1, -1)
+            exit()
+
+        # 4. train loop
+        total_batch_num = 0  #this is for benchmark
+        # Hyper-Params
+        # Every `total_epoch_each_layer` epochs `position` advances by one;
+        # during the first `fixed_epoch_each_layer` epochs of such a period
+        # only the entry at `-position` is trainable, afterwards the last
+        # `position` entries are.
+        total_epoch_each_layer = 15
+        fixed_epoch_each_layer = 10
+        position = 0
+        count = 0
+        for eop in range(args.num_epochs):
+            if num_trainers > 1:
+                imagenet_reader.set_shuffle_seed(eop + (
+                    args.random_seed if args.random_seed else 0))
+            net.train()
+            total_loss = 0.0
+            total_acc1 = 0.0
+            total_acc5 = 0.0
+            total_sample = 0
+            batch_id = 0
+            t_last = 0
+
+            if eop % total_epoch_each_layer == 0:
+                # Start of a new period: move to the next position and load
+                # its token file ./token/<position>.txt.
+                position += 1
+                count = 0
+                with open(
+                        os.path.join('./token', str(position) + '.txt'),
+                        'r+') as f:
+                    lines = f.readlines()
+            # `count` is the 1-based epoch index inside the current period.
+            count += 1
+
+            # 4.1 for each batch, call net() , backward(), and minimize()
+            for idx, (img, label) in enumerate(train_data_loader()):
+                t1 = time.time()
+                if args.max_iter and total_batch_num == args.max_iter:
+                    return
+                label = to_variable(label.numpy().astype('int64').reshape(
+                    int(args.batch_size // place_num), 1))
+                t_start = time.time()
+
+                # 4.1.1 call net()
+                if args.model == "once_for_all_kernel":
+                    # One comma-separated token line per batch index; the
+                    # token file needs at least as many lines as there are
+                    # batches in an epoch, otherwise this raises IndexError.
+                    dy_token = [int(x) for x in lines[idx].strip().split(',')]
+                    if count <= fixed_epoch_each_layer:
+                        dy_trainable = [False for _ in range(len(dy_token))]
+                        dy_trainable[0 - position] = True
+                    else:
+                        dy_trainable = [False] * (len(dy_token) - position
+                                                  ) + [True] * position
+                    if idx < 10:
+                        print(dy_token)
+                        print(dy_trainable)
+                    out = net(img, dy_token=dy_token, dy_trainable=dy_trainable)
+                else:
+                    out = net(img)
+
+                t_end = time.time()
+                softmax_out = fluid.layers.softmax(out, use_cudnn=False)
+                loss = fluid.layers.cross_entropy(
+                    input=softmax_out, label=label)
+                avg_loss = fluid.layers.mean(x=loss)
+                acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+                acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+                t_start_back = time.time()
+
+                # 4.1.2 call backward()
+                if args.use_data_parallel:
+                    avg_loss = net.scale_loss(avg_loss)
+                    avg_loss.backward()
+                    net.apply_collective_grads()
+                else:
+                    avg_loss.backward()
+
+                t_end_back = time.time()
+
+                # 4.1.3 call minimize()
+                optimizer.minimize(avg_loss)
+
+                net.clear_gradients()
+                t2 = time.time()
+                train_batch_elapse = t2 - t1
+                if batch_id % args.print_step == 0:
+                    print( "epoch id: %d, batch step: %d, avg_loss %0.5f, acc_top1 %0.5f, acc_top5 %0.5f, lr %f, %2.4f sec, net_t:%2.4f, back_t:%2.4f, read_t:%2.4f" % \
+                            (eop, batch_id, avg_loss.numpy(), acc_top1.numpy(), acc_top5.numpy(), optimizer._global_learning_rate().numpy(), train_batch_elapse,
+                              t_end - t_start, t_end_back - t_start_back,  t1 - t_last))
+                    sys.stdout.flush()
+                total_loss += avg_loss.numpy()
+                total_acc1 += acc_top1.numpy()
+                total_acc5 += acc_top5.numpy()
+                total_sample += 1
+                batch_id += 1
+                t_last = time.time()
+
+                # NOTE: used for benchmark
+                total_batch_num = total_batch_num + 1
+
+            # NOTE(review): the summaries below divide by total_sample and use
+            # train_batch_elapse, so they fail if the loader yielded no batch.
+            if args.ce:
+                print("kpis\ttrain_acc1\t%0.3f" % (total_acc1 / total_sample))
+                print("kpis\ttrain_acc5\t%0.3f" % (total_acc5 / total_sample))
+                print("kpis\ttrain_loss\t%0.3f" % (total_loss / total_sample))
+            print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f %2.4f sec" % \
+                  (eop, batch_id, total_loss / total_sample, \
+                   total_acc1 / total_sample, total_acc5 / total_sample, train_batch_elapse))
+
+            # 4.2 save checkpoint
+            # With data parallelism only local rank 0 writes files.
+            save_parameters = (not args.use_data_parallel) or (
+                args.use_data_parallel and
+                fluid.dygraph.parallel.Env().local_rank == 0)
+            if save_parameters:
+                if not os.path.isdir(args.model_save_dir):
+                    os.makedirs(args.model_save_dir)
+                model_path = os.path.join(
+                    args.model_save_dir,
+                    "_" + model_path_pre + "_epoch{}".format(eop))
+                fluid.dygraph.save_dygraph(net.state_dict(), model_path)
+                fluid.dygraph.save_dygraph(optimizer.state_dict(), model_path)
+
+            # 4.3 validation
+            net.eval()
+            eval(net, test_data_loader, eop, 0 - position)
+
+        # 5. save final results
+        save_parameters = (not args.use_data_parallel) or (
+            args.use_data_parallel and
+            fluid.dygraph.parallel.Env().local_rank == 0)
+        if save_parameters:
+            model_path = os.path.join(args.model_save_dir,
+                                      "_" + model_path_pre + "_final")
+            fluid.dygraph.save_dygraph(net.state_dict(), model_path)
+
+
+# Script entry point: start training only when this file is executed directly.
+if __name__ == "__main__":
+    train_mobilenet()
--- a/demo/once_for_all/utils/__init__.py
+++ b/demo/once_for_all/utils/__init__.py
+from .optimizer import Optimizer, create_optimizer
+from .utility import add_arguments, print_arguments, parse_args, check_gpu, check_args, check_version, init_model, save_model, create_data_loader, print_info, best_strategy_compiled, init_model, save_model, ExponentialMovingAverage
\ No newline at end of file
--- a/demo/once_for_all/utils/autoaugment.py
+++ b/demo/once_for_all/utils/autoaugment.py
+#   Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is based on https://github.com/DeepVoltaire/AutoAugment/blob/master/autoaugment.py
+"""
+from PIL import Image, ImageEnhance, ImageOps
+import numpy as np
+import random
+
+
+class ImageNetPolicy(object):
+    """AutoAugment sub-policy table for ImageNet.
+
+    The table below holds 25 ``SubPolicy`` entries (several are repeated).
+    Calling the object applies exactly one entry to the image: a uniformly
+    random one by default, or entry ``policy_idx % 25`` when an integer
+    ``policy_idx`` is supplied.
+
+        Example:
+        >>> policy = ImageNetPolicy()
+        >>> transformed = policy(image)
+    """
+
+    def __init__(self, fillcolor=(128, 128, 128)):
+        # Row layout: (p1, operation1, magnitude_idx1,
+        #              p2, operation2, magnitude_idx2)
+        table = (
+            (0.4, "posterize", 8, 0.6, "rotate", 9),
+            (0.6, "solarize", 5, 0.6, "autocontrast", 5),
+            (0.8, "equalize", 8, 0.6, "equalize", 3),
+            (0.6, "posterize", 7, 0.6, "posterize", 6),
+            (0.4, "equalize", 7, 0.2, "solarize", 4),
+            (0.4, "equalize", 4, 0.8, "rotate", 8),
+            (0.6, "solarize", 3, 0.6, "equalize", 7),
+            (0.8, "posterize", 5, 1.0, "equalize", 2),
+            (0.2, "rotate", 3, 0.6, "solarize", 8),
+            (0.6, "equalize", 8, 0.4, "posterize", 6),
+            (0.8, "rotate", 8, 0.4, "color", 0),
+            (0.4, "rotate", 9, 0.6, "equalize", 2),
+            (0.0, "equalize", 7, 0.8, "equalize", 8),
+            (0.6, "invert", 4, 1.0, "equalize", 8),
+            (0.6, "color", 4, 1.0, "contrast", 8),
+            (0.8, "rotate", 8, 1.0, "color", 2),
+            (0.8, "color", 8, 0.8, "solarize", 7),
+            (0.4, "sharpness", 7, 0.6, "invert", 8),
+            (0.6, "shearX", 5, 1.0, "equalize", 9),
+            (0.4, "color", 0, 0.6, "equalize", 3),
+            (0.4, "equalize", 7, 0.2, "solarize", 4),
+            (0.6, "solarize", 5, 0.6, "autocontrast", 5),
+            (0.6, "invert", 4, 1.0, "equalize", 8),
+            (0.6, "color", 4, 1.0, "contrast", 8),
+            (0.8, "equalize", 8, 0.6, "equalize", 3),
+        )
+        self.policies = [
+            SubPolicy(p1, op1, mag1, p2, op2, mag2, fillcolor)
+            for p1, op1, mag1, p2, op2, mag2 in table
+        ]
+
+    def __call__(self, img, policy_idx=None):
+        count = len(self.policies)
+        if isinstance(policy_idx, int):
+            # Explicit index: wrap it into the valid range.
+            chosen = policy_idx % count
+        else:
+            # None (or any non-int value) means "pick at random".
+            chosen = random.randint(0, count - 1)
+        return self.policies[chosen](img)
+
+    def __repr__(self):
+        return "AutoAugment ImageNet Policy"
+
+
+class CIFAR10Policy(object):
+    """AutoAugment sub-policy table for CIFAR10.
+
+    The table below holds 25 ``SubPolicy`` entries. Calling the object
+    applies exactly one entry to the image: a uniformly random one by
+    default, or entry ``policy_idx % 25`` when an integer ``policy_idx``
+    is supplied.
+
+        Example:
+        >>> policy = CIFAR10Policy()
+        >>> transformed = policy(image)
+    """
+
+    def __init__(self, fillcolor=(128, 128, 128)):
+        # Row layout: (p1, operation1, magnitude_idx1,
+        #              p2, operation2, magnitude_idx2)
+        table = (
+            (0.1, "invert", 7, 0.2, "contrast", 6),
+            (0.7, "rotate", 2, 0.3, "translateX", 9),
+            (0.8, "sharpness", 1, 0.9, "sharpness", 3),
+            (0.5, "shearY", 8, 0.7, "translateY", 9),
+            (0.5, "autocontrast", 8, 0.9, "equalize", 2),
+            (0.2, "shearY", 7, 0.3, "posterize", 7),
+            (0.4, "color", 3, 0.6, "brightness", 7),
+            (0.3, "sharpness", 9, 0.7, "brightness", 9),
+            (0.6, "equalize", 5, 0.5, "equalize", 1),
+            (0.6, "contrast", 7, 0.6, "sharpness", 5),
+            (0.7, "color", 7, 0.5, "translateX", 8),
+            (0.3, "equalize", 7, 0.4, "autocontrast", 8),
+            (0.4, "translateY", 3, 0.2, "sharpness", 6),
+            (0.9, "brightness", 6, 0.2, "color", 8),
+            (0.5, "solarize", 2, 0.0, "invert", 3),
+            (0.2, "equalize", 0, 0.6, "autocontrast", 0),
+            (0.2, "equalize", 8, 0.8, "equalize", 4),
+            (0.9, "color", 9, 0.6, "equalize", 6),
+            (0.8, "autocontrast", 4, 0.2, "solarize", 8),
+            (0.1, "brightness", 3, 0.7, "color", 0),
+            (0.4, "solarize", 5, 0.9, "autocontrast", 3),
+            (0.9, "translateY", 9, 0.7, "translateY", 9),
+            (0.9, "autocontrast", 2, 0.8, "solarize", 3),
+            (0.8, "equalize", 8, 0.1, "invert", 3),
+            (0.7, "translateY", 9, 0.9, "autocontrast", 1),
+        )
+        self.policies = [
+            SubPolicy(p1, op1, mag1, p2, op2, mag2, fillcolor)
+            for p1, op1, mag1, p2, op2, mag2 in table
+        ]
+
+    def __call__(self, img, policy_idx=None):
+        count = len(self.policies)
+        if isinstance(policy_idx, int):
+            # Explicit index: wrap it into the valid range.
+            chosen = policy_idx % count
+        else:
+            # None (or any non-int value) means "pick at random".
+            chosen = random.randint(0, count - 1)
+        return self.policies[chosen](img)
+
+    def __repr__(self):
+        return "AutoAugment CIFAR10 Policy"
+
+
+class SVHNPolicy(object):
+    """AutoAugment sub-policy table for SVHN.
+
+    The table below holds 25 ``SubPolicy`` entries. Calling the object
+    applies exactly one entry to the image: a uniformly random one by
+    default, or entry ``policy_idx % 25`` when an integer ``policy_idx``
+    is supplied.
+
+        Example:
+        >>> policy = SVHNPolicy()
+        >>> transformed = policy(image)
+    """
+
+    def __init__(self, fillcolor=(128, 128, 128)):
+        # Row layout: (p1, operation1, magnitude_idx1,
+        #              p2, operation2, magnitude_idx2)
+        table = (
+            (0.9, "shearX", 4, 0.2, "invert", 3),
+            (0.9, "shearY", 8, 0.7, "invert", 5),
+            (0.6, "equalize", 5, 0.6, "solarize", 6),
+            (0.9, "invert", 3, 0.6, "equalize", 3),
+            (0.6, "equalize", 1, 0.9, "rotate", 3),
+            (0.9, "shearX", 4, 0.8, "autocontrast", 3),
+            (0.9, "shearY", 8, 0.4, "invert", 5),
+            (0.9, "shearY", 5, 0.2, "solarize", 6),
+            (0.9, "invert", 6, 0.8, "autocontrast", 1),
+            (0.6, "equalize", 3, 0.9, "rotate", 3),
+            (0.9, "shearX", 4, 0.3, "solarize", 3),
+            (0.8, "shearY", 8, 0.7, "invert", 4),
+            (0.9, "equalize", 5, 0.6, "translateY", 6),
+            (0.9, "invert", 4, 0.6, "equalize", 7),
+            (0.3, "contrast", 3, 0.8, "rotate", 4),
+            (0.8, "invert", 5, 0.0, "translateY", 2),
+            (0.7, "shearY", 6, 0.4, "solarize", 8),
+            (0.6, "invert", 4, 0.8, "rotate", 4),
+            (0.3, "shearY", 7, 0.9, "translateX", 3),
+            (0.1, "shearX", 6, 0.6, "invert", 5),
+            (0.7, "solarize", 2, 0.6, "translateY", 7),
+            (0.8, "shearY", 4, 0.8, "invert", 8),
+            (0.7, "shearX", 9, 0.8, "translateY", 3),
+            (0.8, "shearY", 5, 0.7, "autocontrast", 3),
+            (0.7, "shearX", 2, 0.1, "invert", 5),
+        )
+        self.policies = [
+            SubPolicy(p1, op1, mag1, p2, op2, mag2, fillcolor)
+            for p1, op1, mag1, p2, op2, mag2 in table
+        ]
+
+    def __call__(self, img, policy_idx=None):
+        count = len(self.policies)
+        if isinstance(policy_idx, int):
+            # Explicit index: wrap it into the valid range.
+            chosen = policy_idx % count
+        else:
+            # None (or any non-int value) means "pick at random".
+            chosen = random.randint(0, count - 1)
+        return self.policies[chosen](img)
+
+    def __repr__(self):
+        return "AutoAugment SVHN Policy"
+
+
+class SubPolicy(object):
+    """Two image operations, each applied with its own probability.
+
+    Args:
+        p1: probability of applying ``operation1``.
+        operation1: key naming the first operation (one of the keys of the
+            ``ranges``/``func`` tables below).
+        magnitude_idx1: index into the 10-entry magnitude table of
+            ``operation1``.
+        p2, operation2, magnitude_idx2: same, for the second operation.
+        fillcolor: fill colour passed to the shear/translate transforms.
+
+    Raises:
+        KeyError: if an operation name is not in the tables.
+        IndexError: if a magnitude index falls outside its table.
+    """
+
+    def __init__(self,
+                 p1,
+                 operation1,
+                 magnitude_idx1,
+                 p2,
+                 operation2,
+                 magnitude_idx2,
+                 fillcolor=(128, 128, 128)):
+        # Magnitude tables: 10 levels per operation. The last three
+        # operations take no magnitude, so their tables are placeholders.
+        ranges = {
+            "shearX": np.linspace(0, 0.3, 10),
+            "shearY": np.linspace(0, 0.3, 10),
+            # 150.0 keeps this a true division under Python 2 as well; this
+            # file does not import ``division`` from ``__future__``, so
+            # ``150 / 331`` would evaluate to 0 there.
+            "translateX": np.linspace(0, 150.0 / 331, 10),
+            "translateY": np.linspace(0, 150.0 / 331, 10),
+            "rotate": np.linspace(0, 30, 10),
+            "color": np.linspace(0.0, 0.9, 10),
+            # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the
+            # dtype it aliased.
+            "posterize": np.round(np.linspace(8, 4, 10), 0).astype(int),
+            "solarize": np.linspace(256, 0, 10),
+            "contrast": np.linspace(0.0, 0.9, 10),
+            "sharpness": np.linspace(0.0, 0.9, 10),
+            "brightness": np.linspace(0.0, 0.9, 10),
+            "autocontrast": [0] * 10,
+            "equalize": [0] * 10,
+            "invert": [0] * 10
+        }
+
+        # from https://stackoverflow.com/questions/5252170/specify-image-filling-color-when-rotating-in-python-with-pil-and-setting-expand
+        def rotate_with_fill(img, magnitude):
+            # Rotate in RGBA, then composite over a constant canvas using the
+            # rotated image as the mask, and convert back to the input mode.
+            # NOTE(review): the canvas colour is hard-coded to 128 and does
+            # not follow ``fillcolor`` -- confirm whether that is intended.
+            rot = img.convert("RGBA").rotate(magnitude)
+            return Image.composite(rot,
+                                   Image.new("RGBA", rot.size, (128, ) * 4),
+                                   rot).convert(img.mode)
+
+        # Operation table. Most magnitude-based operations draw a random
+        # sign on every call; "rotate" always uses the magnitude as given.
+        func = {
+            "shearX": lambda img, magnitude: img.transform(
+                img.size, Image.AFFINE, (1, magnitude * random.choice([-1, 1]), 0, 0, 1, 0),
+                Image.BICUBIC, fillcolor=fillcolor),
+            "shearY": lambda img, magnitude: img.transform(
+                img.size, Image.AFFINE, (1, 0, 0, magnitude * random.choice([-1, 1]), 1, 0),
+                Image.BICUBIC, fillcolor=fillcolor),
+            "translateX": lambda img, magnitude: img.transform(
+                img.size, Image.AFFINE, (1, 0, magnitude * img.size[0] * random.choice([-1, 1]), 0, 1, 0),
+                fillcolor=fillcolor),
+            "translateY": lambda img, magnitude: img.transform(
+                img.size, Image.AFFINE, (1, 0, 0, 0, 1, magnitude * img.size[1] * random.choice([-1, 1])),
+                fillcolor=fillcolor),
+            "rotate": lambda img, magnitude: rotate_with_fill(img, magnitude),
+            "color": lambda img, magnitude: ImageEnhance.Color(img).enhance(1 + magnitude * random.choice([-1, 1])),
+            "posterize": lambda img, magnitude: ImageOps.posterize(img, magnitude),
+            "solarize": lambda img, magnitude: ImageOps.solarize(img, magnitude),
+            "contrast": lambda img, magnitude: ImageEnhance.Contrast(img).enhance(
+                1 + magnitude * random.choice([-1, 1])),
+            "sharpness": lambda img, magnitude: ImageEnhance.Sharpness(img).enhance(
+                1 + magnitude * random.choice([-1, 1])),
+            "brightness": lambda img, magnitude: ImageEnhance.Brightness(img).enhance(
+                1 + magnitude * random.choice([-1, 1])),
+            "autocontrast": lambda img, magnitude: ImageOps.autocontrast(img),
+            "equalize": lambda img, magnitude: ImageOps.equalize(img),
+            "invert": lambda img, magnitude: ImageOps.invert(img)
+        }
+
+        self.p1 = p1
+        self.operation1 = func[operation1]
+        self.magnitude1 = ranges[operation1][magnitude_idx1]
+        self.p2 = p2
+        self.operation2 = func[operation2]
+        self.magnitude2 = ranges[operation2][magnitude_idx2]
+
+    def __call__(self, img):
+        """Apply each operation with its probability and return the image."""
+        if random.random() < self.p1:
+            img = self.operation1(img, self.magnitude1)
+        if random.random() < self.p2:
+            img = self.operation2(img, self.magnitude2)
+        return img
--- a/demo/once_for_all/utils/dist_utils.py
+++ b/demo/once_for_all/utils/dist_utils.py
+#   Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import paddle.fluid as fluid
+
+
+def nccl2_prepare(args, startup_prog, main_prog):
+    """Run the distribute transpiler in "nccl2" mode over the given programs.
+
+    Args:
+        args: object whose ``dist_env`` mapping provides "trainer_id",
+            "trainer_endpoints" (joined with commas) and "current_endpoint".
+        startup_prog: program passed as ``startup_program``.
+        main_prog: program passed as ``program``.
+    """
+    nccl_config = fluid.DistributeTranspilerConfig()
+    nccl_config.mode = "nccl2"
+    transpiler = fluid.DistributeTranspiler(config=nccl_config)
+
+    dist_env = args.dist_env
+    trainer_id = dist_env["trainer_id"]
+    endpoint_list = ','.join(dist_env["trainer_endpoints"])
+    own_endpoint = dist_env["current_endpoint"]
+
+    transpiler.transpile(
+        trainer_id,
+        trainers=endpoint_list,
+        current_endpoint=own_endpoint,
+        startup_program=startup_prog,
+        program=main_prog)
+
+
+def pserver_prepare(args, train_prog, startup_prog):
+    """Transpile for parameter-server training and return this role's programs.
+
+    Args:
+        args: object providing ``split_var``, ``async_mode`` and a
+            ``dist_env`` mapping with "training_role", "trainer_id",
+            "pserver_endpoints", "num_trainers" and "current_endpoint".
+        train_prog: program passed as ``program``.
+        startup_prog: program passed as ``startup_program``.
+
+    Returns:
+        ``(pserver_program, pserver_startup_program)`` when the role is
+        "PSERVER"; ``(trainer_program, startup_prog)`` when it is "TRAINER".
+
+    Raises:
+        ValueError: for any other role value.
+    """
+    ps_config = fluid.DistributeTranspilerConfig()
+    ps_config.slice_var_up = args.split_var
+    transpiler = fluid.DistributeTranspiler(config=ps_config)
+    dist_env = args.dist_env
+    role = dist_env["training_role"]
+
+    transpiler.transpile(
+        dist_env["trainer_id"],
+        program=train_prog,
+        pservers=dist_env["pserver_endpoints"],
+        trainers=dist_env["num_trainers"],
+        sync_mode=not args.async_mode,
+        startup_program=startup_prog)
+
+    if role == "PSERVER":
+        server_prog = transpiler.get_pserver_program(
+            dist_env["current_endpoint"])
+        server_startup = transpiler.get_startup_program(
+            dist_env["current_endpoint"],
+            server_prog,
+            startup_program=startup_prog)
+        return server_prog, server_startup
+
+    if role == "TRAINER":
+        return transpiler.get_trainer_program(), startup_prog
+
+    raise ValueError(
+        'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
+    )
+
+
+def nccl2_prepare_paddle(trainer_id, startup_prog, main_prog):
+    """Run the "nccl2" transpiler using endpoints taken from the environment.
+
+    The trainer list and this process's endpoint are read from the
+    ``PADDLE_TRAINER_ENDPOINTS`` and ``PADDLE_CURRENT_ENDPOINT`` environment
+    variables (``None`` is passed through when a variable is unset).
+    """
+    all_endpoints = os.environ.get('PADDLE_TRAINER_ENDPOINTS')
+    own_endpoint = os.environ.get('PADDLE_CURRENT_ENDPOINT')
+
+    nccl_config = fluid.DistributeTranspilerConfig()
+    nccl_config.mode = "nccl2"
+
+    fluid.DistributeTranspiler(config=nccl_config).transpile(
+        trainer_id,
+        trainers=all_endpoints,
+        current_endpoint=own_endpoint,
+        startup_program=startup_prog,
+        program=main_prog)
+
+
+def prepare_for_multi_process(exe, build_strategy, train_prog):
+    """Configure multi-process training when more than one trainer is set.
+
+    Reads ``PADDLE_TRAINER_ID`` (default 0) and ``PADDLE_TRAINERS_NUM``
+    (default 1) from the environment. With fewer than two trainers nothing
+    happens. Otherwise both values are stored on ``build_strategy``,
+    ``train_prog`` is transpiled for nccl2 against a fresh startup program,
+    and that startup program is run on ``exe``.
+    """
+    env = os.environ
+    trainer_id = int(env.get('PADDLE_TRAINER_ID', 0))
+    num_trainers = int(env.get('PADDLE_TRAINERS_NUM', 1))
+
+    if num_trainers >= 2:
+        print("PADDLE_TRAINERS_NUM", num_trainers)
+        print("PADDLE_TRAINER_ID", trainer_id)
+        build_strategy.num_trainers = num_trainers
+        build_strategy.trainer_id = trainer_id
+        # NOTE(zcd): use multi processes to train the model,
+        # and each process use one GPU card.
+        fresh_startup = fluid.Program()
+        nccl2_prepare_paddle(trainer_id, fresh_startup, train_prog)
+        # Original author's note: the startup program ends up being run
+        # twice, which is stated to be harmless.
+        exe.run(fresh_startup)
--- a/demo/once_for_all/utils/optimizer.py
+++ b/demo/once_for_all/utils/optimizer.py
+#   Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+import paddle.fluid as fluid
+import paddle.fluid.layers.ops as ops
+from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
+
+
+class CosineDecayWarmup(LearningRateDecay):
+    """Learning-rate schedule: linear warmup, then a cosine curve.
+
+    While ``step_num / step_each_epoch`` is below ``warmup_epochs`` the rate
+    grows linearly from 0 towards ``learning_rate``. Afterwards it is
+    ``learning_rate * 0.5 * (cos(x) + 1)`` with
+    ``x = (step_num - warmup_epochs * step_each_epoch) * pi
+    / (epochs * step_each_epoch)``.
+    """
+
+    def __init__(self,
+                 learning_rate,
+                 step_each_epoch,
+                 epochs,
+                 warmup_epochs,
+                 begin=0,
+                 step=1,
+                 dtype='float32'):
+        super(CosineDecayWarmup, self).__init__(begin, step, dtype)
+        self.warmup_epochs = warmup_epochs
+        self.epochs = epochs
+        self.step_each_epoch = step_each_epoch
+        self.learning_rate = learning_rate
+
+    def step(self):
+        in_warmup = (
+            self.step_num / self.step_each_epoch) < self.warmup_epochs
+        if in_warmup:
+            # Linear ramp: fraction of the warmup steps already completed.
+            return self.learning_rate * (self.step_num / (
+                self.step_each_epoch * self.warmup_epochs))
+
+        # Cosine phase, measured from the end of warmup.
+        angle = (self.step_num - self.warmup_epochs * self.step_each_epoch
+                 ) * math.pi / (self.epochs * self.step_each_epoch)
+        cosine = fluid.layers.cos(self.create_lr_var(angle))
+        return self.learning_rate * 0.5 * (cosine + 1)
+
+
+class Optimizer(object):
+    """Factory for the optimizer / learning-rate combinations of the demo.
+
+    Each public method builds and returns one configured ``fluid.optimizer``
+    object; ``create_optimizer`` selects the method by name.
+
+    Attributes:
+        parameter_list: parameters handed to every optimizer.
+        batch_size: batch size used to compute steps per epoch.
+        lr: base learning rate.
+        lr_strategy: name of the learning rate decay strategy.
+        l2_decay: L2 regularization coefficient.
+        momentum_rate: momentum used by Momentum and RMSProp.
+        step_epochs: epoch boundaries for piecewise decay.
+        num_epochs: total number of epochs.
+        warm_up_epochs: warmup length used by ``cosine_decay_warmup``.
+        decay_epochs, decay_rate: stored from ``args``; not read by any
+            method of this class.
+        total_images: number of training images.
+        step: number of steps in one epoch,
+            ``ceil(total_images / batch_size)``.
+    """
+
+    def __init__(self, args, parameter_list):
+        self.parameter_list = parameter_list
+        self.batch_size = args.batch_size
+        self.lr = args.lr
+        self.lr_strategy = args.lr_strategy
+        self.l2_decay = args.l2_decay
+        self.momentum_rate = args.momentum_rate
+        self.step_epochs = args.step_epochs
+        self.num_epochs = args.num_epochs
+        self.warm_up_epochs = args.warm_up_epochs
+        self.decay_epochs = args.decay_epochs
+        self.decay_rate = args.decay_rate
+        self.total_images = args.total_images
+
+        self.step = int(math.ceil(float(self.total_images) / self.batch_size))
+
+    def piecewise_decay(self):
+        """piecewise decay with Momentum optimizer
+
+        The rate is multiplied by 0.1 at every boundary in ``step_epochs``.
+
+        Returns:
+            a piecewise_decay optimizer
+        """
+        bd = [self.step * e for e in self.step_epochs]
+        lr = [self.lr * (0.1**i) for i in range(len(bd) + 1)]
+        learning_rate = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=self.momentum_rate,
+            regularization=fluid.regularizer.L2Decay(self.l2_decay),
+            parameter_list=self.parameter_list)
+        return optimizer
+
+    def cosine_decay(self):
+        """cosine decay with Momentum optimizer
+
+        Returns:
+            a cosine_decay optimizer
+        """
+
+        learning_rate = fluid.layers.cosine_decay(
+            learning_rate=self.lr,
+            step_each_epoch=self.step,
+            epochs=self.num_epochs)
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=self.momentum_rate,
+            regularization=fluid.regularizer.L2Decay(self.l2_decay),
+            parameter_list=self.parameter_list)
+        return optimizer
+
+    def cosine_decay_warmup(self):
+        """cosine decay with warmup and Momentum optimizer
+
+        Returns:
+            a cosine_decay_with_warmup optimizer
+        """
+
+        learning_rate = CosineDecayWarmup(
+            learning_rate=self.lr,
+            step_each_epoch=self.step,
+            epochs=self.num_epochs,
+            # Honour the configured warmup length instead of a fixed 5.
+            warmup_epochs=self.warm_up_epochs)
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=self.momentum_rate,
+            regularization=fluid.regularizer.L2Decay(self.l2_decay),
+            parameter_list=self.parameter_list)
+        return optimizer
+
+    def linear_decay(self):
+        """linear decay with Momentum optimizer
+
+        Returns:
+            a linear_decay optimizer
+        """
+
+        end_lr = 0
+        # NOTE(review): the second argument is the number of steps in ONE
+        # epoch (self.step) -- confirm that a decay horizon of one epoch,
+        # rather than the whole run, is intended.
+        learning_rate = fluid.layers.polynomial_decay(
+            self.lr, self.step, end_lr, power=1)
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=self.momentum_rate,
+            regularization=fluid.regularizer.L2Decay(self.l2_decay),
+            parameter_list=self.parameter_list)
+
+        return optimizer
+
+    def adam_decay(self):
+        """Adam optimizer with a constant learning rate
+
+        Returns:
+            an adam_decay optimizer
+        """
+
+        return fluid.optimizer.Adam(
+            learning_rate=self.lr, parameter_list=self.parameter_list)
+
+    def cosine_decay_RMSProp(self):
+        """cosine decay with RMSProp optimizer
+
+        Returns:
+            a cosine_decay_RMSProp optimizer
+        """
+
+        learning_rate = fluid.layers.cosine_decay(
+            learning_rate=self.lr,
+            step_each_epoch=self.step,
+            epochs=self.num_epochs)
+        optimizer = fluid.optimizer.RMSProp(
+            learning_rate=learning_rate,
+            momentum=self.momentum_rate,
+            regularization=fluid.regularizer.L2Decay(self.l2_decay),
+            # Apply epsilon=1 on ImageNet dataset.
+            epsilon=1,
+            parameter_list=self.parameter_list)
+        return optimizer
+
+    def default_decay(self):
+        """Momentum optimizer with a constant learning rate
+
+        Returns:
+            default decay optimizer
+        """
+
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=self.lr,
+            momentum=self.momentum_rate,
+            regularization=fluid.regularizer.L2Decay(self.l2_decay),
+            parameter_list=self.parameter_list)
+        return optimizer
+
+
+def create_optimizer(args, parameter_list):
+    """Build the optimizer selected by ``args.lr_strategy``.
+
+    ``args.lr_strategy`` must be the name of an ``Optimizer`` method; that
+    method is looked up with ``getattr`` and called without arguments.
+    """
+    factory = Optimizer(args, parameter_list)
+    build = getattr(factory, args.lr_strategy)
+    return build()
--- a/demo/once_for_all/utils/utility.py
+++ b/demo/once_for_all/utils/utility.py
+#   Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import distutils.util
+import numpy as np
+import six
+import argparse
+import functools
+import logging
+import sys
+import os
+import warnings
+import signal
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
+from paddle.fluid.framework import Program, program_guard, name_scope, default_main_program
+from paddle.fluid import unique_name, layers
+from utils import dist_utils
+
+
+def print_arguments(args):
+    """Print every attribute of an argparse namespace, sorted by name.
+
+    Usage:
+    .. code-block:: python
+        parser = argparse.ArgumentParser()
+        parser.add_argument("name", default="Jonh", type=str, help="User name.")
+        args = parser.parse_args()
+        print_arguments(args)
+    :param args: Input argparse.Namespace for printing.
+    :type args: argparse.Namespace
+    """
+    header = "-------------  Configuration Arguments -------------"
+    footer = "----------------------------------------------------"
+
+    print(header)
+    for name, setting in sorted(six.iteritems(vars(args))):
+        # Name right-aligned in a 25-character column.
+        print("%25s : %s" % (name, setting))
+    print(footer)
+
+
+def add_arguments(argname, type, default, help, argparser, **kwargs):
+    """Register ``--<argname>`` on ``argparser``.
+
+    ``bool`` options are parsed with ``distutils.util.strtobool``; any other
+    ``type`` is passed to argparse unchanged. The help text gets
+    " Default: %(default)s." appended, and extra keyword arguments are
+    forwarded to ``add_argument``.
+
+    Usage:
+    .. code-block:: python
+        parser = argparse.ArgumentParser()
+        add_argument("name", str, "Jonh", "User name.", parser)
+        args = parser.parse_args()
+    """
+    if type == bool:
+        converter = distutils.util.strtobool
+    else:
+        converter = type
+
+    flag = "--" + argname
+    help_text = help + ' Default: %(default)s.'
+    argparser.add_argument(
+        flag, default=default, type=converter, help=help_text, **kwargs)
+
+
+def parse_args():
+    """Define every command-line option of the demo and parse the arguments.
+
+    Scalar options are registered through ``add_arguments`` (bound to the
+    parser as ``add_arg``); list-valued options (``--step_epochs``,
+    ``--image_mean``, ``--image_std``) are added directly on the parser
+    with ``nargs='+'``.
+
+    Returns:
+        the namespace returned by ``parser.parse_args()`` holding all
+        training args
+    """
+    # The parser description is this module's docstring.
+    parser = argparse.ArgumentParser(description=__doc__)
+    add_arg = functools.partial(add_arguments, argparser=parser)
+    # yapf: disable
+
+    add_arg('use_data_parallel',                  bool,   False,                   "The flag indicating whether to use data parallel mode to train the model.")
+    add_arg('ce',                  bool,   False,                   "run ce.")
+
+    # ENV
+    add_arg('use_gpu',                  bool,   True,                   "Whether to use GPU.")
+    add_arg('model_save_dir',           str,    "./output",        "The directory path to save model.")
+    add_arg('data_dir',                 str,    "./data/ILSVRC2012/",   "The ImageNet dataset root directory.")
+    #add_arg('data_dir',                 str,    "../../PaddleCV/image_classification/data/",   "The ImageNet dataset root directory.")
+    add_arg('pretrained_model',         str,    None,                   "Whether to load pretrained model.")
+    # NOTE(review): the default below is a concrete checkpoint path rather than None -- confirm it is intended.
+    add_arg('checkpoint',               str,    './output.ofa.mul/_ofa_epoch60',                   "Whether to resume checkpoint.")
+    add_arg('print_step',               int,    10,                     "The steps interval to print logs")
+    add_arg('save_step',                int,    1,                      "The steps interval to save checkpoints")
+
+    # SOLVER AND HYPERPARAMETERS
+    add_arg('model',                    str,    "once_for_all_kernel",   "The name of network.")
+    add_arg('total_images',             int,    1281167,                "The number of total training images.")
+    add_arg('num_epochs',               int,    360,                    "The number of total epochs.")
+    add_arg('class_dim',                int,    1000,                   "The number of total classes.")
+    add_arg('image_shape',              str,    "3,224,224",            "The size of Input image, order: [channels, height, weidth] ")
+    add_arg('batch_size',               int,    512,                      "Minibatch size on a device.")
+    add_arg('test_batch_size',          int,    512,                     "Test batch size on a deveice.")
+    add_arg('lr',                       float,  0.4,                    "The learning rate.")
+    add_arg('lr_strategy',              str,    "cosine_decay",      "The learning rate decay strategy.")
+    add_arg('l2_decay',                 float,  1e-4,                   "The l2_decay parameter.")
+    add_arg('momentum_rate',            float,  0.9,                    "The value of momentum_rate.")
+    add_arg('warm_up_epochs',           float,  5.0,                    "The value of warm up epochs")
+    add_arg('decay_epochs',             float,  2.4,                    "Decay epochs of exponential decay learning rate scheduler")
+    add_arg('decay_rate',               float,  0.97,                   "Decay rate of exponential decay learning rate scheduler")
+    add_arg('drop_connect_rate',        float,  0.2,                    "The value of drop connect rate")
+    # List-valued option: one or more integer epochs.
+    parser.add_argument('--step_epochs', nargs='+', type=int, default=[360], help="piecewise decay step")
+
+    # NOTE: used for benchmark
+    add_arg('max_iter',                 int,    0,                      "The number of total train max_iters.")
+
+
+    # READER AND PREPROCESS
+    add_arg('lower_scale',              float,  0.08,                   "The value of lower_scale in ramdom_crop")
+    add_arg('lower_ratio',              float,  3./4.,                  "The value of lower_ratio in ramdom_crop")
+    add_arg('upper_ratio',              float,  4./3.,                  "The value of upper_ratio in ramdom_crop")
+    add_arg('resize_short_size',        int,    256,                    "The value of resize_short_size")
+    add_arg('crop_size',                int,    224,                    "The value of crop size")
+    add_arg('use_mixup',                bool,   False,                  "Whether to use mixup")
+    add_arg('mixup_alpha',              float,  0.2,                    "The value of mixup_alpha")
+    add_arg('reader_thread',            int,    8,                      "The number of multi thread reader")
+    add_arg('reader_buf_size',          int,    16,                   "The buf size of multi thread reader")
+    add_arg('interpolation',            int,    None,                   "The interpolation mode")
+    add_arg('use_aa',                   bool,   False,                  "Whether to use auto augment")
+    # List-valued options: one float per image channel is expected by the defaults.
+    parser.add_argument('--image_mean', nargs='+', type=float, default=[0.485, 0.456, 0.406], help="The mean of input image data")
+    parser.add_argument('--image_std', nargs='+', type=float, default=[0.229, 0.224, 0.225], help="The std of input image data")
+
+    # SWITCH
+    #NOTE: (2019/08/08) FP16 is moving to PaddlePaddle/Fleet now
+    #add_arg('use_fp16',                 bool,   False,                  "Whether to enable half precision training with fp16." )
+    #add_arg('scale_loss',               float,  1.0,                    "The value of scale_loss for fp16." )
+    add_arg('use_label_smoothing',      bool,   False,                  "Whether to use label_smoothing")
+    add_arg('label_smoothing_epsilon',  float,  0.1,                    "The value of label_smoothing_epsilon parameter")
+    #NOTE: (2019/08/08) temporary disable use_distill
+    #add_arg('use_distill',              bool,   False,                  "Whether to use distill")
+    add_arg('random_seed',              int,    None,                   "random seed")
+    add_arg('use_ema',                  bool,   False,                  "Whether to use ExponentialMovingAverage.")
+    add_arg('ema_decay',                float,  0.9999,                 "The value of ema decay rate")
+    add_arg('padding_type',             str,    "SAME",                 "Padding type of convolution")
+    add_arg('use_se',                   bool,   True,                   "Whether to use Squeeze-and-Excitation module for EfficientNet.")
+    # yapf: enable
+
+    args = parser.parse_args()
+
+    return args
+
+
+def check_gpu(args=None):
+    """Exit when GPU execution is requested on a CPU-only PaddlePaddle build.
+
+    Args:
+        args: optional object exposing a boolean ``use_gpu`` attribute
+            (normally the parsed command-line arguments). When omitted, a
+            module-level ``args`` is looked up for backward compatibility
+            with zero-argument callers; if none exists the check is skipped.
+
+    Previously the function read an undefined name ``args`` inside a blanket
+    ``try/except Exception: pass``; the resulting NameError was swallowed and
+    the check silently did nothing.
+
+    Side effects:
+        Prints the error text and calls ``sys.exit(1)`` when ``use_gpu`` is
+        true and ``fluid.is_compiled_with_cuda()`` is false.
+    """
+    err = "Config use_gpu cannot be set as true while you are " \
+          "using paddlepaddle cpu version ! \nPlease try: \n" \
+          "\t1. Install paddlepaddle-gpu to run model on GPU \n" \
+          "\t2. Set use_gpu as false in config file to run " \
+          "model on CPU"
+
+    if args is None:
+        # Legacy call style: fall back to a module-level ``args`` if present.
+        args = globals().get("args")
+
+    # A missing object or missing attribute means "GPU not requested".
+    if not getattr(args, "use_gpu", False):
+        return
+
+    try:
+        cuda_available = fluid.is_compiled_with_cuda()
+    except AttributeError:
+        # Stay best-effort if this Paddle build does not expose the API.
+        return
+
+    if not cuda_available:
+        print(err)
+        sys.exit(1)
+
+
+def check_version():
+    """Ensure the installed PaddlePaddle satisfies the minimum version.
+
+    Calls ``fluid.require_version('1.6.0')``; if that raises, prints a hint
+    and terminates the process with exit status 1.
+    """
+    minimum = '1.6.0'
+    message = ("PaddlePaddle version 1.6 or higher is required, "
+               "or a suitable develop version is satisfied as well. \n"
+               "Please make sure the version is good with your code.")
+
+    try:
+        fluid.require_version(minimum)
+    except Exception:
+        print(message)
+        sys.exit(1)
+
+
+def check_args(args):
+    """Validate the parsed arguments before running.
+
+    Args:
+        args: parsed argument object. The attributes read here are
+            ``model``, ``lr_strategy``, ``use_mixup``, ``interpolation``,
+            ``padding_type``, ``checkpoint``, ``pretrained_model``,
+            ``use_gpu``, ``batch_size`` and ``data_dir``.
+
+    Raises:
+        AssertionError: if any of the checks below fails.
+
+    Side effects:
+        Appends ".." to ``sys.path`` and imports ``models``; resets
+        ``args.lr_strategy`` to "default_decay" (with a warning) when it is
+        not a known strategy; finally calls ``check_gpu()`` and
+        ``check_version()``.
+    """
+
+    # check models name
+    sys.path.append("..")
+    import models
+    model_list = [m for m in dir(models) if "__" not in m]
+    assert args.model in model_list, "{} is not in lists: {}, please check the model name".format(
+        args.model, model_list)
+
+    # check learning rate strategy
+    lr_strategy_list = [
+        "piecewise_decay", "cosine_decay", "linear_decay",
+        "cosine_decay_warmup", "exponential_decay_warmup"
+    ]
+    if args.lr_strategy not in lr_strategy_list:
+        warnings.warn(
+            "\n{} is not in lists: {}, \nUse default learning strategy now.".
+            format(args.lr_strategy, lr_strategy_list))
+        args.lr_strategy = "default_decay"
+
+    # check conflict of GoogLeNet and mixup
+    if args.model == "GoogLeNet":
+        assert args.use_mixup == False, "Cannot use mixup processing in GoogLeNet, please set use_mixup = False."
+
+    # 0 is a valid interpolation mode, so test against None explicitly
+    if args.interpolation is not None:
+        assert args.interpolation in [
+            0, 1, 2, 3, 4
+        ], "Wrong interpolation, please set:\n0: cv2.INTER_NEAREST\n1: cv2.INTER_LINEAR\n2: cv2.INTER_CUBIC\n3: cv2.INTER_AREA\n4: cv2.INTER_LANCZOS4"
+
+    if args.padding_type:
+        assert args.padding_type in [
+            "SAME", "VALID", "DYNAMIC"
+        ], "Wrong padding_type, please set:\nSAME\nVALID\nDYNAMIC"
+
+    assert args.checkpoint is None or args.pretrained_model is None, "Do not init model by checkpoint and pretrained_model both."
+
+    # check pretrained_model path for loading
+    if args.pretrained_model is not None:
+        assert isinstance(args.pretrained_model, str)
+        assert os.path.isdir(
+            args.
+            pretrained_model), "please support available pretrained_model path."
+
+    #FIXME: check checkpoint path for saving
+    if args.checkpoint is not None:
+        assert isinstance(args.checkpoint, str)
+        assert os.path.isdir(
+            args.checkpoint
+        ), "please support available checkpoint path for initing model."
+
+    # check params for loading (currently disabled):
+    # if args.save_params:
+    #     assert isinstance(args.save_params, str)
+    #     assert os.path.isdir(
+    #         args.save_params), "please support available save_params path."
+
+    # check gpu: when using gpu, the number of visible cards should divide batch size
+    if args.use_gpu:
+        device_count = fluid.core.get_cuda_device_count()
+        # Guard the modulo below: with no visible card it would raise a bare
+        # ZeroDivisionError instead of a readable message.
+        assert device_count > 0, "use_gpu is set but no CUDA device is visible, please check CUDA_VISIBLE_DEVICES."
+        assert args.batch_size % device_count == 0, "please support correct batch_size({}), which can be divided by available cards({}), you can change the number of cards by indicating: export CUDA_VISIBLE_DEVICES= ".format(
+            args.batch_size, device_count)
+
+    # check data directory
+    assert os.path.isdir(
+        args.data_dir
+    ), "Data doesn't exist in {}, please load right path".format(args.data_dir)
+
+    #check gpu
+
+    check_gpu()
+    check_version()
+
+
+def init_model(exe, args, program):
+    """Initialise ``program`` from a checkpoint and/or a pretrained model.
+
+    Args:
+        exe: executor handed to the fluid loading routines.
+        args: arguments object; ``checkpoint`` and ``pretrained_model`` are
+            read and treated as directory paths when truthy.
+        program: program passed as ``main_program`` to the loaders.
+    """
+    checkpoint_dir = args.checkpoint
+    if checkpoint_dir:
+        fluid.io.load_persistables(exe, checkpoint_dir, main_program=program)
+        print("Finish initing model from %s" % (checkpoint_dir))
+
+    if not args.pretrained_model:
+        return
+
+    def _has_saved_file(var):
+        # Load only variables that have a same-named file in the directory.
+        candidate = os.path.join(args.pretrained_model, var.name)
+        return os.path.exists(candidate)
+
+    fluid.io.load_vars(
+        exe,
+        args.pretrained_model,
+        main_program=program,
+        predicate=_has_saved_file)
+
+
+def save_model(args, exe, train_prog, info):
+    """Save the persistables of ``train_prog`` to disk.
+
+    The target directory is ``<args.model_save_dir>/<args.model>/<info>``
+    and is created when it does not exist yet.
+    """
+    target_dir = os.path.join(args.model_save_dir, args.model, str(info))
+    already_there = os.path.isdir(target_dir)
+    if not already_there:
+        os.makedirs(target_dir)
+    fluid.io.save_persistables(exe, target_dir, main_program=train_prog)
+    print("Already save model in %s" % (target_dir))
+
+
+def create_data_loader(is_train, args):
+    """Create the feed variables and a generator-backed data loader.
+
+    Args:
+        is_train: truthy when the loader is built for training.
+        args: arguments object; ``image_shape`` (comma-separated string) is
+            always read, ``use_mixup`` only when ``is_train`` is truthy.
+    Returns:
+        A ``(data_loader, feeds)`` pair. ``feeds`` is
+        ``[feed_image, feed_y_a, feed_y_b, feed_lam]`` when training with
+        mixup, otherwise ``[feed_image, feed_label]``.
+    """
+    image_shape = list(map(int, args.image_shape.split(",")))
+
+    def _declare(name, shape, dtype):
+        return fluid.data(name=name, shape=shape, dtype=dtype, lod_level=0)
+
+    feed_image = _declare("feed_image", [None] + image_shape, "float32")
+    feed_label = _declare("feed_label", [None, 1], "int64")
+    # feed_y_a is declared unconditionally; it is only returned with mixup.
+    feed_y_a = _declare("feed_y_a", [None, 1], "int64")
+
+    if is_train and args.use_mixup:
+        feed_y_b = _declare("feed_y_b", [None, 1], "int64")
+        feed_lam = _declare("feed_lam", [None, 1], "float32")
+        feeds = [feed_image, feed_y_a, feed_y_b, feed_lam]
+    else:
+        feeds = [feed_image, feed_label]
+
+    # Both modes use the same loader configuration.
+    data_loader = fluid.io.DataLoader.from_generator(
+        capacity=64, use_double_buffer=True, iterable=True, return_list=True)
+
+    return data_loader, feeds
+
+
+def print_info(pass_id, batch_id, print_step, metrics, time_info, info_mode):
+    """Print training / evaluation progress to stdout.
+
+    Args:
+        pass_id: epoch index
+        batch_id: batch index
+        print_step: in "batch" mode, print only when batch_id is a
+            multiple of this value
+        metrics: sequence of numbers; its length selects the line layout
+        time_info: elapsed time in seconds (used in "batch" mode only)
+        info_mode: "batch", "epoch" or "ce"
+    Raises:
+        Exception: unsupported metrics length in "batch" mode, or an
+            unknown info_mode
+        Warning: info_mode is "ce"
+    """
+    if info_mode == "batch":
+        if batch_id % print_step != 0:
+            return
+
+        # line templates keyed by len(metrics):
+        # 2 -> train with mixup, 4 -> train without mixup, 3 -> test
+        batch_templates = {
+            2: "[Pass {0}, train batch {1}] \tloss {2}, lr {3}, elapse {4}",
+            4: "[Pass {0}, train batch {1}] \tloss {2}, acc1 {3}, acc5 {4}, lr {5}, elapse {6}",
+            3: "[Pass {0}, test  batch {1}] \tloss {2}, acc1 {3}, acc5 {4}, elapse {5}",
+        }
+        template = batch_templates.get(len(metrics))
+        if template is None:
+            raise Exception(
+                "length of metrics {} is not implemented, It maybe caused by wrong format of build_program_output".
+                format(len(metrics)))
+
+        fields = ["%.5f" % value for value in metrics]
+        fields.append("%2.4f sec" % time_info)
+        print(template.format(pass_id, batch_id, *fields))
+        sys.stdout.flush()
+        return
+
+    if info_mode == "epoch":
+        ## TODO add time elapse
+        # len(metrics) -> (index of the entry left out of the line, template)
+        epoch_layouts = {
+            5: (1,
+                "[End pass {0}]\ttrain_loss {1}, test_loss {2}, test_acc1 {3}, test_acc5 {4}"),
+            7: (3,
+                "[End pass {0}]\ttrain_loss {1}, train_acc1 {2}, train_acc5 {3},test_loss {4}, test_acc1 {5}, test_acc5 {6}"),
+        }
+        layout = epoch_layouts.get(len(metrics))
+        if layout is not None:
+            skipped, template = layout
+            fields = [
+                "%.5f" % value for position, value in enumerate(metrics)
+                if position != skipped
+            ]
+            print(template.format(pass_id, *fields))
+        # other lengths print nothing, but stdout is still flushed
+        sys.stdout.flush()
+        return
+
+    if info_mode == "ce":
+        raise Warning("CE code is not ready")
+
+    raise Exception("Illegal info_mode")
+
+
+def best_strategy_compiled(args, program, loss, exe):
+    """Wrap ``program`` in a data-parallel compiled program.
+
+    Args:
+        args: arguments object; ``use_gpu`` is read when more than one
+            trainer is configured.
+        program: the program to compile.
+        loss: variable whose ``name`` is passed as ``loss_name``.
+        exe: executor forwarded to the multi-process preparation step.
+    Returns:
+        ``program`` itself when the FLAGS_use_ngraph environment variable is
+        set to a non-empty value, otherwise the compiled program.
+    """
+    if os.getenv('FLAGS_use_ngraph'):
+        return program
+
+    build_strategy = fluid.compiler.BuildStrategy()
+    # build_strategy.enable_inplace is left untouched here
+    # (noted in the code as a feature to be supported in Fluid v1.6).
+
+    exec_strategy = fluid.ExecutionStrategy()
+    exec_strategy.num_threads = fluid.core.get_cuda_device_count()
+    exec_strategy.num_iteration_per_drop_scope = 10
+
+    trainer_count = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
+    multi_process_gpu = trainer_count > 1 and args.use_gpu
+    if multi_process_gpu:
+        dist_utils.prepare_for_multi_process(exe, build_strategy, program)
+        # NOTE: the process is fast when num_threads is 1
+        # for multi-process training.
+        exec_strategy.num_threads = 1
+
+    return fluid.CompiledProgram(program).with_data_parallel(
+        loss_name=loss.name,
+        build_strategy=build_strategy,
+        exec_strategy=exec_strategy)
+
+
+class ExponentialMovingAverage(object):
+    """Keep exponential moving averages (EMA) of the trainable parameters.
+
+    On construction, for every parameter in the default main program whose
+    ``do_model_average`` is not ``False``, this creates a temporary backup
+    variable and a persistable EMA variable, and builds two helper programs:
+
+    * ``apply_program``: backs each parameter up into its temporary variable
+      and overwrites the parameter with its EMA value.
+    * ``restore_program``: copies the backups into the parameters again.
+
+    ``update()`` appends the averaging ops to the main program; ``apply()``
+    and ``restore()`` run the helper programs with a given executor.
+    """
+
+    def __init__(self,
+                 decay=0.999,
+                 thres_steps=None,
+                 zero_debias=False,
+                 name=None):
+        """Create the EMA variables and the apply / restore programs.
+
+        Args:
+            decay: base decay rate of the moving average.
+            thres_steps: optional step value used to schedule the decay as
+                ``(thres_steps + 1) / (thres_steps + 10)``, capped by
+                ``decay`` (see ``_get_ema_decay``).
+            zero_debias: when true, the EMA value is divided by
+                ``1 - decay ** (global_steps + 1)`` before being applied.
+            name: optional prefix for the names of the created variables.
+        """
+        self._decay = decay
+        self._thres_steps = thres_steps
+        self._name = name if name is not None else ''
+        self._decay_var = self._get_ema_decay()
+
+        # (parameter, temporary backup variable) pairs for every tracked
+        # parameter of the default main program.
+        self._params_tmps = []
+        for param in default_main_program().global_block().all_parameters():
+            if param.do_model_average != False:
+                tmp = param.block.create_var(
+                    name=unique_name.generate(".".join(
+                        [self._name + param.name, 'ema_tmp'])),
+                    dtype=param.dtype,
+                    persistable=False,
+                    stop_gradient=True)
+                self._params_tmps.append((param, tmp))
+
+        # Maps parameter name -> persistable EMA variable.
+        self._ema_vars = {}
+        for param, tmp in self._params_tmps:
+            with param.block.program._optimized_guard(
+                [param, tmp]), name_scope('moving_average'):
+                self._ema_vars[param.name] = self._create_ema_vars(param)
+
+        # Program that swaps the EMA values into the parameters.
+        self.apply_program = Program()
+        block = self.apply_program.global_block()
+        with program_guard(main_program=self.apply_program):
+            decay_pow = self._get_decay_pow(block)
+            for param, tmp in self._params_tmps:
+                param = block._clone_variable(param)
+                tmp = block._clone_variable(tmp)
+                ema = block._clone_variable(self._ema_vars[param.name])
+                # Back up the current parameter value before overwriting it.
+                layers.assign(input=param, output=tmp)
+                # bias correction
+                if zero_debias:
+                    ema = ema / (1.0 - decay_pow)
+                layers.assign(input=ema, output=param)
+
+        # Program that copies the backed-up values into the parameters.
+        self.restore_program = Program()
+        block = self.restore_program.global_block()
+        with program_guard(main_program=self.restore_program):
+            for param, tmp in self._params_tmps:
+                tmp = block._clone_variable(tmp)
+                param = block._clone_variable(param)
+                layers.assign(input=tmp, output=param)
+
+    def _get_ema_decay(self):
+        """Create and return the global variable holding the decay rate.
+
+        The variable is initialised to ``self._decay``. When
+        ``self._thres_steps`` is given, ops are added that set it to
+        ``(thres_steps + 1) / (thres_steps + 10)`` while that value is
+        smaller than ``self._decay``, and to ``self._decay`` otherwise.
+        """
+        with default_main_program()._lr_schedule_guard():
+            decay_var = layers.tensor.create_global_var(
+                shape=[1],
+                value=self._decay,
+                dtype='float32',
+                persistable=True,
+                name="scheduled_ema_decay_rate")
+
+            if self._thres_steps is not None:
+                # NOTE(review): presumably thres_steps is a step-counter
+                # Variable, since the comparison feeds a Switch case - confirm.
+                decay_t = (self._thres_steps + 1.0) / (self._thres_steps + 10.0)
+                with layers.control_flow.Switch() as switch:
+                    with switch.case(decay_t < self._decay):
+                        layers.tensor.assign(decay_t, decay_var)
+                    with switch.default():
+                        layers.tensor.assign(
+                            np.array(
+                                [self._decay], dtype=np.float32),
+                            decay_var)
+        return decay_var
+
+    def _get_decay_pow(self, block):
+        """Return ``decay_var ** (global_steps + 1)`` built inside ``block``.
+
+        ``global_steps`` comes from the learning-rate scheduler's step
+        counter; the decay variable is cloned into ``block`` first.
+        """
+        global_steps = layers.learning_rate_scheduler._decay_step_counter()
+        decay_var = block._clone_variable(self._decay_var)
+        decay_pow_acc = layers.elementwise_pow(decay_var, global_steps + 1)
+        return decay_pow_acc
+
+    def _create_ema_vars(self, param):
+        """Create a zero-initialised persistable EMA variable for ``param``.
+
+        The variable has the same shape and dtype as ``param``.
+        """
+        param_ema = layers.create_global_var(
+            name=unique_name.generate(self._name + param.name + '_ema'),
+            shape=param.shape,
+            value=0.0,
+            dtype=param.dtype,
+            persistable=True)
+
+        return param_ema
+
+    def update(self):
+        """
+        Update Exponential Moving Average. Should only call this method in
+        train program.
+
+        For each tracked parameter this appends ops computing
+        ``ema = ema * decay + param * (1 - decay)``. Parameters that have a
+        matching ``<name>.master`` entry in ``self._ema_vars`` are instead
+        filled by casting that master EMA.
+        """
+        param_master_emas = []
+        for param, tmp in self._params_tmps:
+            with param.block.program._optimized_guard(
+                [param, tmp]), name_scope('moving_average'):
+                param_ema = self._ema_vars[param.name]
+                # NOTE(review): '<name>.master' keys exist only if a
+                # parameter with that exact name is tracked - confirm the
+                # fp16 master-weight naming against the caller.
+                if param.name + '.master' in self._ema_vars:
+                    master_ema = self._ema_vars[param.name + '.master']
+                    param_master_emas.append([param_ema, master_ema])
+                else:
+                    ema_t = param_ema * self._decay_var + param * (
+                        1 - self._decay_var)
+                    layers.assign(input=ema_t, output=param_ema)
+
+        # for fp16 params
+        for param_ema, master_ema in param_master_emas:
+            # Cast the master EMA into the parameter's own EMA variable.
+            default_main_program().global_block().append_op(
+                type="cast",
+                inputs={"X": master_ema},
+                outputs={"Out": param_ema},
+                attrs={
+                    "in_dtype": master_ema.dtype,
+                    "out_dtype": param_ema.dtype
+                })
+
+    @signature_safe_contextmanager
+    def apply(self, executor, need_restore=True):
+        """
+        Apply moving average to parameters for evaluation.
+        Args:
+            executor (Executor): The Executor to execute applying.
+            need_restore (bool): Whether to restore parameters after applying.
+        """
+        executor.run(self.apply_program)
+        try:
+            yield
+        finally:
+            # Runs even if the body of the ``with`` block raised.
+            if need_restore:
+                self.restore(executor)
+
+    def restore(self, executor):
+        """Restore parameters.
+        Args:
+            executor (Executor): The Executor to execute restoring.
+        """
+        executor.run(self.restore_program)