add ofa demo

e830dfa9 · xiteng1988 · a596e2b6 · e830dfa9 · e830dfa9 · e830dfa9
14 changed file
--- a/demo/once_for_all/dy_models/__init__.py
+++ b/demo/once_for_all/dy_models/__init__.py
+from .once_for_all_kernel import OFA_kernel
+from .mobilenet_v1 import MobileNetV1
+from .mobilenet_v2 import MobileNetV2
+from .mobilenet_v3 import MobileNetV3_dy
--- a/demo/once_for_all/dy_models/func_conv.py
+++ b/demo/once_for_all/dy_models/func_conv.py
--- a/demo/once_for_all/dy_models/mobilenet_v1.py
+++ b/demo/once_for_all/dy_models/mobilenet_v1.py
+#   Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+import os
+import time
+import sys
+import math
+import numpy as np
+import argparse
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.initializer import MSRA
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
+from paddle.fluid.dygraph.base import to_variable
+from paddle.fluid import framework
+class ConvBNLayer(fluid.dygraph.Layer):
+    def __init__(self,
+                 num_channels,
+                 filter_size,
+                 num_filters,
+                 stride,
+                 padding,
+                 channels=None,
+                 num_groups=1,
+                 act='relu',
+                 use_cudnn=True,
+                 name=None):
+        super(ConvBNLayer, self).__init__()
+        self._conv = Conv2D(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            act=None,
+            use_cudnn=use_cudnn,
+            param_attr=ParamAttr(
+                initializer=MSRA(), name=self.full_name() + "_weights"),
+            bias_attr=False)
+        self._batch_norm = BatchNorm(
+            num_filters,
+            act=act,
+            param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"),
+            bias_attr=ParamAttr(name=self.full_name() + "_bn" + "_offset"),
+            moving_mean_name=self.full_name() + "_bn" + '_mean',
+            moving_variance_name=self.full_name() + "_bn" + '_variance')
+    def forward(self, inputs):
+        y = self._conv(inputs)
+        y = self._batch_norm(y)
+        return y
+class DepthwiseSeparable(fluid.dygraph.Layer):
+    def __init__(self,
+                 num_channels,
+                 num_filters1,
+                 num_filters2,
+                 num_groups,
+                 stride,
+                 scale,
+                 name=None):
+        super(DepthwiseSeparable, self).__init__()
+        self._depthwise_conv = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=int(num_filters1 * scale),
+            filter_size=3,
+            stride=stride,
+            padding=1,
+            num_groups=int(num_groups * scale),
+            use_cudnn=False)
+        self._pointwise_conv = ConvBNLayer(
+            num_channels=int(num_filters1 * scale),
+            filter_size=1,
+            num_filters=int(num_filters2 * scale),
+            stride=1,
+            padding=0)
+    def forward(self, inputs):
+        y = self._depthwise_conv(inputs)
+        y = self._pointwise_conv(y)
+        return y
+class MobileNetV1(fluid.dygraph.Layer):
+    def __init__(self, scale=1.0, class_dim=1000):
+        super(MobileNetV1, self).__init__()
+        self.scale = scale
+        self.dwsl = []
+        self.conv1 = ConvBNLayer(
+            num_channels=3,
+            filter_size=3,
+            channels=3,
+            num_filters=int(32 * scale),
+            stride=2,
+            padding=1)
+        dws21 = self.add_sublayer(
+            sublayer=DepthwiseSeparable(
+                num_channels=int(32 * scale),
+                num_filters1=32,
+                num_filters2=64,
+                num_groups=32,
+                stride=1,
+                scale=scale),
+            name="conv2_1")
+        self.dwsl.append(dws21)
+        dws22 = self.add_sublayer(
+            sublayer=DepthwiseSeparable(
+                num_channels=int(64 * scale),
+                num_filters1=64,
+                num_filters2=128,
+                num_groups=64,
+                stride=2,
+                scale=scale),
+            name="conv2_2")
+        self.dwsl.append(dws22)
+        dws31 = self.add_sublayer(
+            sublayer=DepthwiseSeparable(
+                num_channels=int(128 * scale),
+                num_filters1=128,
+                num_filters2=128,
+                num_groups=128,
+                stride=1,
+                scale=scale),
+            name="conv3_1")
+        self.dwsl.append(dws31)
+        dws32 = self.add_sublayer(
+            sublayer=DepthwiseSeparable(
+                num_channels=int(128 * scale),
+                num_filters1=128,
+                num_filters2=256,
+                num_groups=128,
+                stride=2,
+                scale=scale),
+            name="conv3_2")
+        self.dwsl.append(dws32)
+        dws41 = self.add_sublayer(
+            sublayer=DepthwiseSeparable(
+                num_channels=int(256 * scale),
+                num_filters1=256,
+                num_filters2=256,
+                num_groups=256,
+                stride=1,
+                scale=scale),
+            name="conv4_1")
+        self.dwsl.append(dws41)
+        dws42 = self.add_sublayer(
+            sublayer=DepthwiseSeparable(
+                num_channels=int(256 * scale),
+                num_filters1=256,
+                num_filters2=512,
+                num_groups=256,
+                stride=2,
+                scale=scale),
+            name="conv4_2")
+        self.dwsl.append(dws42)
+        for i in range(5):
+            tmp = self.add_sublayer(
+                sublayer=DepthwiseSeparable(
+                    num_channels=int(512 * scale),
+                    num_filters1=512,
+                    num_filters2=512,
+                    num_groups=512,
+                    stride=1,
+                    scale=scale),
+                name="conv5_" + str(i + 1))
+            self.dwsl.append(tmp)
+        dws56 = self.add_sublayer(
+            sublayer=DepthwiseSeparable(
+                num_channels=int(512 * scale),
+                num_filters1=512,
+                num_filters2=1024,
+                num_groups=512,
+                stride=2,
+                scale=scale),
+            name="conv5_6")
+        self.dwsl.append(dws56)
+        dws6 = self.add_sublayer(
+            sublayer=DepthwiseSeparable(
+                num_channels=int(1024 * scale),
+                num_filters1=1024,
+                num_filters2=1024,
+                num_groups=1024,
+                stride=1,
+                scale=scale),
+            name="conv6")
+        self.dwsl.append(dws6)
+        self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
+        self.out = Linear(
+            int(1024 * scale),
+            class_dim,
+            param_attr=ParamAttr(
+                initializer=MSRA(), name=self.full_name() + "fc7_weights"),
+            bias_attr=ParamAttr(name="fc7_offset"))
+    def forward(self, inputs):
+        y = self.conv1(inputs)
+        for dws in self.dwsl:
+            y = dws(y)
+        y = self.pool2d_avg(y)
+        y = fluid.layers.reshape(y, shape=[-1, 1024])
+        y = self.out(y)
+        return y
--- a/demo/once_for_all/dy_models/mobilenet_v2.py
+++ b/demo/once_for_all/dy_models/mobilenet_v2.py
+#   Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+import os
+import time
+import math
+import sys
+import numpy as np
+import argparse
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.initializer import MSRA
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
+from paddle.fluid.dygraph.base import to_variable
+from paddle.fluid import framework
+class ConvBNLayer(fluid.dygraph.Layer):
+    def __init__(self,
+                 num_channels,
+                 filter_size,
+                 num_filters,
+                 stride,
+                 padding,
+                 channels=None,
+                 num_groups=1,
+                 use_cudnn=True):
+        super(ConvBNLayer, self).__init__()
+        tmp_param = ParamAttr(name=self.full_name() + "_weights")
+        self._conv = Conv2D(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            act=None,
+            use_cudnn=use_cudnn,
+            param_attr=tmp_param,
+            bias_attr=False)
+        self._batch_norm = BatchNorm(
+            num_filters,
+            param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"),
+            bias_attr=ParamAttr(name=self.full_name() + "_bn" + "_offset"),
+            moving_mean_name=self.full_name() + "_bn" + '_mean',
+            moving_variance_name=self.full_name() + "_bn" + '_variance')
+    def forward(self, inputs, if_act=True):
+        y = self._conv(inputs)
+        y = self._batch_norm(y)
+        if if_act:
+            y = fluid.layers.relu6(y)
+        return y
+class InvertedResidualUnit(fluid.dygraph.Layer):
+    def __init__(
+            self,
+            num_channels,
+            num_in_filter,
+            num_filters,
+            stride,
+            filter_size,
+            padding,
+            expansion_factor, ):
+        super(InvertedResidualUnit, self).__init__()
+        num_expfilter = int(round(num_in_filter * expansion_factor))
+        self._expand_conv = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=num_expfilter,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            num_groups=1)
+        self._bottleneck_conv = ConvBNLayer(
+            num_channels=num_expfilter,
+            num_filters=num_expfilter,
+            filter_size=filter_size,
+            stride=stride,
+            padding=padding,
+            num_groups=num_expfilter,
+            use_cudnn=False)
+        self._linear_conv = ConvBNLayer(
+            num_channels=num_expfilter,
+            num_filters=num_filters,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            num_groups=1)
+    def forward(self, inputs, ifshortcut):
+        y = self._expand_conv(inputs, if_act=True)
+        y = self._bottleneck_conv(y, if_act=True)
+        y = self._linear_conv(y, if_act=False)
+        if ifshortcut:
+            y = fluid.layers.elementwise_add(inputs, y)
+        return y
+class InvresiBlocks(fluid.dygraph.Layer):
+    def __init__(self, in_c, t, c, n, s):
+        super(InvresiBlocks, self).__init__()
+        self._first_block = InvertedResidualUnit(
+            num_channels=in_c,
+            num_in_filter=in_c,
+            num_filters=c,
+            stride=s,
+            filter_size=3,
+            padding=1,
+            expansion_factor=t)
+        self._inv_blocks = []
+        for i in range(1, n):
+            tmp = self.add_sublayer(
+                sublayer=InvertedResidualUnit(
+                    num_channels=c,
+                    num_in_filter=c,
+                    num_filters=c,
+                    stride=1,
+                    filter_size=3,
+                    padding=1,
+                    expansion_factor=t),
+                name=self.full_name() + "_" + str(i + 1))
+            self._inv_blocks.append(tmp)
+    def forward(self, inputs):
+        y = self._first_block(inputs, ifshortcut=False)
+        for inv_block in self._inv_blocks:
+            y = inv_block(y, ifshortcut=True)
+        return y
+class MobileNetV2(fluid.dygraph.Layer):
+    def __init__(self, class_dim=1000, scale=1.0):
+        super(MobileNetV2, self).__init__()
+        self.scale = scale
+        self.class_dim = class_dim
+        bottleneck_params_list = [
+            (1, 16, 1, 1),
+            (6, 24, 2, 2),
+            (6, 32, 3, 2),
+            (6, 64, 4, 2),
+            (6, 96, 3, 1),
+            (6, 160, 3, 2),
+            (6, 320, 1, 1),
+        ]
+        #1. conv1 
+        self._conv1 = ConvBNLayer(
+            num_channels=3,
+            num_filters=int(32 * scale),
+            filter_size=3,
+            stride=2,
+            padding=1)
+        #2. bottleneck sequences
+        self._invl = []
+        i = 1
+        in_c = int(32 * scale)
+        for layer_setting in bottleneck_params_list:
+            t, c, n, s = layer_setting
+            i += 1
+            tmp = self.add_sublayer(
+                sublayer=InvresiBlocks(
+                    in_c=in_c, t=t, c=int(c * scale), n=n, s=s),
+                name='conv' + str(i))
+            self._invl.append(tmp)
+            in_c = int(c * scale)
+        #3. last_conv
+        self._out_c = int(1280 * scale) if scale > 1.0 else 1280
+        self._conv9 = ConvBNLayer(
+            num_channels=in_c,
+            num_filters=self._out_c,
+            filter_size=1,
+            stride=1,
+            padding=0)
+        #4. pool
+        self._pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
+        #5. fc
+        tmp_param = ParamAttr(name=self.full_name() + "fc10_weights")
+        self._fc = Linear(
+            self._out_c,
+            class_dim,
+            param_attr=tmp_param,
+            bias_attr=ParamAttr(name="fc10_offset"))
+    def forward(self, inputs):
+        y = self._conv1(inputs, if_act=True)
+        for inv in self._invl:
+            y = inv(y)
+        y = self._conv9(y, if_act=True)
+        y = self._pool2d_avg(y)
+        y = fluid.layers.reshape(y, shape=[-1, self._out_c])
+        y = self._fc(y)
+        return y
--- a/demo/once_for_all/dy_models/mobilenet_v3.py
+++ b/demo/once_for_all/dy_models/mobilenet_v3.py
+#   Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+import paddle.fluid as fluid
+from paddle.fluid.initializer import MSRA
+from paddle.fluid.param_attr import ParamAttr
+def make_divisible(v, divisor=8, min_value=None):
+    if min_value is None:
+        min_value = divisor
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
+class MobileNetV3_dy(fluid.dygraph.Layer):
+    def __init__(self, scale=1.0, model_name='small', class_dim=1000):
+        super(MobileNetV3_dy, self).__init__()
+        inplanes = 16
+        if model_name == "large":
+            self.cfg = [
+                # k, exp, c,  se,     nl,  s,
+                [3, 16, 16, False, 'relu', 1],
+                [3, 64, 24, False, 'relu', 2],
+                [3, 72, 24, False, 'relu', 1],
+                [5, 72, 40, True, 'relu', 2],
+                [5, 120, 40, True, 'relu', 1],
+                [5, 120, 40, True, 'relu', 1],
+                [3, 240, 80, False, 'hard_swish', 2],
+                [3, 200, 80, False, 'hard_swish', 1],
+                [3, 184, 80, False, 'hard_swish', 1],
+                [3, 184, 80, False, 'hard_swish', 1],
+                [3, 480, 112, True, 'hard_swish', 1],
+                [3, 672, 112, True, 'hard_swish', 1],
+                [5, 672, 160, True, 'hard_swish', 2],
+                [5, 960, 160, True, 'hard_swish', 1],
+                [5, 960, 160, True, 'hard_swish', 1],
+            ]
+            self.cls_ch_squeeze = 960
+            self.cls_ch_expand = 1280
+        elif model_name == "small":
+            self.cfg = [
+                # k, exp, c,  se,     nl,  s,
+                [3, 16, 16, True, 'relu', 2],
+                [3, 72, 24, False, 'relu', 2],
+                [3, 88, 24, False, 'relu', 1],
+                [5, 96, 40, True, 'hard_swish', 2],
+                [5, 240, 40, True, 'hard_swish', 1],
+                [5, 240, 40, True, 'hard_swish', 1],
+                [5, 120, 48, True, 'hard_swish', 1],
+                [5, 144, 48, True, 'hard_swish', 1],
+                [5, 288, 96, True, 'hard_swish', 2],
+                [5, 576, 96, True, 'hard_swish', 1],
+                [5, 576, 96, True, 'hard_swish', 1],
+            ]
+            self.cls_ch_squeeze = 576
+            self.cls_ch_expand = 1280
+        else:
+            raise NotImplementedError("mode[" + model_name +
+                                      "_model] is not implemented!")
+        # network
+        # conv1
+        self.conv1 = ConvBnLayer(
+            in_c=3,
+            out_c=make_divisible(inplanes * scale),
+            filter_size=3,
+            stride=2,
+            padding=1,
+            num_groups=1,
+            if_act=True,
+            act='hard_swish',
+            name=self.full_name() + 'conv1')
+        # conv_blocks
+        self.all_blocks = []
+        i = 0
+        inplanes = make_divisible(inplanes * scale)
+        for (k, exp, c, se, nl, s) in self.cfg:
+            self.all_blocks.append(
+                ResidualUnit(
+                    in_c=inplanes,
+                    mid_c=make_divisible(scale * exp),
+                    out_c=make_divisible(scale * c),
+                    filter_size=k,
+                    stride=s,
+                    ues_se=se,
+                    act=nl,
+                    name=self.full_name() + 'conv' + str(i + 2)))
+            self.add_sublayer(
+                sublayer=self.all_blocks[-1],
+                name=self.full_name() + 'conv' + str(i + 2))
+            inplanes = make_divisible(scale * c)
+            i += 1
+        # last_second_conv
+        self.last_second_conv = ConvBnLayer(
+            in_c=inplanes,
+            out_c=make_divisible(scale * self.cls_ch_squeeze),
+            filter_size=1,
+            stride=1,
+            padding=0,
+            num_groups=1,
+            if_act=True,
+            act='hard_swish',
+            name=self.full_name() + 'last_second_conv')
+        # global_avg_pool
+        self.pool = fluid.dygraph.Pool2D(
+            pool_type='avg', global_pooling=True, use_cudnn=False)
+        # last_conv
+        self.last_conv = fluid.dygraph.Conv2D(
+            num_channels=make_divisible(scale * self.cls_ch_squeeze),
+            num_filters=self.cls_ch_expand,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            act=None,
+            param_attr=ParamAttr(
+                name=self.full_name() + 'last_1x1_conv_weights'),
+            bias_attr=False)
+        # fc
+        self.fc = fluid.dygraph.Linear(
+            input_dim=self.cls_ch_expand,
+            output_dim=class_dim,
+            param_attr=ParamAttr(name=self.full_name() + 'fc_weights'))
+    def forward(self, inputs, label=None, dropout_prob=0.2):
+        x = self.conv1(inputs)
+        for i in range(len(self.all_blocks)):
+            x = self.all_blocks[i](x)
+        x = self.last_second_conv(x)
+        x = self.pool(x)
+        x = self.last_conv(x)
+        x = fluid.layers.hard_swish(x)
+        x = fluid.layers.dropout(x=x, dropout_prob=dropout_prob)
+        x = fluid.layers.reshape(x, shape=[x.shape[0], x.shape[1]])
+        x = self.fc(x)
+        if label:
+            acc1 = fluid.layers.accuracy(input=x, label=label)
+            acc5 = fluid.layers.accuracy(input=x, label=label, k=5)
+            return x, acc1, acc5
+        return x
+class ConvBnLayer(fluid.dygraph.Layer):
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 filter_size,
+                 stride,
+                 padding,
+                 num_groups=1,
+                 if_act=True,
+                 act=None,
+                 use_cudnn=True,
+                 name=''):
+        super(ConvBnLayer, self).__init__()
+        self.if_act = if_act
+        self.act = act
+        self.conv = fluid.dygraph.Conv2D(
+            num_channels=in_c,
+            num_filters=out_c,
+            filter_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            param_attr=ParamAttr(name=name + "_weights"),
+            bias_attr=False,
+            use_cudnn=use_cudnn,
+            act=None)
+        self.bn = fluid.dygraph.BatchNorm(
+            num_channels=out_c,
+            act=None,
+            param_attr=ParamAttr(
+                name=name + "_bn" + "_scale",
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=0.0)),
+            bias_attr=ParamAttr(
+                name=name + "_bn" + "_offset",
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=0.0)),
+            moving_mean_name=name + "_bn" + '_mean',
+            moving_variance_name=name + "_bn" + '_variance')
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn(x)
+        if self.if_act:
+            if self.act == 'relu':
+                x = fluid.layers.relu(x)
+            elif self.act == 'hard_swish':
+                x = fluid.layers.hard_swish(x)
+            else:
+                print('The activation function is selected incorrectly.')
+                exit()
+        return x
+class ResidualUnit(fluid.dygraph.Layer):
+    def __init__(self,
+                 in_c,
+                 mid_c,
+                 out_c,
+                 filter_size,
+                 stride,
+                 ues_se,
+                 act=None,
+                 name=''):
+        super(ResidualUnit, self).__init__()
+        self.if_shortcut = stride == 1 and in_c == out_c
+        self.if_se = ues_se
+        self.expand_conv = ConvBnLayer(
+            in_c=in_c,
+            out_c=mid_c,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            if_act=True,
+            act=act,
+            name=name + '_expand')
+        self.bottleneck_conv = ConvBnLayer(
+            in_c=mid_c,
+            out_c=mid_c,
+            filter_size=filter_size,
+            stride=stride,
+            padding=int((filter_size - 1) // 2),
+            num_groups=mid_c,
+            if_act=True,
+            act=act,
+            name=name + '_depthwise')
+        if self.if_se:
+            self.mid_se = SEModule(mid_c, name=name + '_SE')
+        self.linear_conv = ConvBnLayer(
+            in_c=mid_c,
+            out_c=out_c,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            if_act=False,
+            act=None,
+            name=name + '_linear')
+    def forward(self, inputs):
+        x = self.expand_conv(inputs)
+        x = self.bottleneck_conv(x)
+        if self.if_se:
+            x = self.mid_se(x)
+        x = self.linear_conv(x)
+        if self.if_shortcut:
+            x = fluid.layers.elementwise_add(inputs, x)
+        return x
+class SEModule(fluid.dygraph.Layer):
+    def __init__(self, channel, reduction=4, name=''):
+        super(SEModule, self).__init__()
+        self.avg_pool = fluid.dygraph.Pool2D(
+            pool_type='avg', global_pooling=True, use_cudnn=False)
+        self.conv1 = fluid.dygraph.Conv2D(
+            num_channels=channel,
+            num_filters=channel // reduction,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            act='relu',
+            param_attr=ParamAttr(name=name + "_weights1"),
+            bias_attr=ParamAttr(name=name + "_bias1"))
+        self.conv2 = fluid.dygraph.Conv2D(
+            num_channels=channel // reduction,
+            num_filters=channel,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            act=None,
+            param_attr=ParamAttr(name=name + "_weights2"),
+            bias_attr=ParamAttr(name=name + "_bias2"))
+    def forward(self, inputs):
+        outputs = self.avg_pool(inputs)
+        outputs = self.conv1(outputs)
+        outputs = self.conv2(outputs)
+        outputs = fluid.layers.hard_sigmoid(outputs, slope=0.2, offset=0.5)
+        return fluid.layers.elementwise_mul(x=inputs, y=outputs, axis=0)
+if __name__ == "__main__":
+    import numpy as np
+    place = fluid.CPUPlace()
+    with fluid.dygraph.guard(place):
+        model = MobileNetV3_dy(scale=0.6, model_name='large', class_dim=1000)
+        img = np.random.uniform(0, 255, [8, 3, 224, 224]).astype('float32')
+        img = fluid.dygraph.to_variable(img)
+        res = model(img)
+        print(res.shape)
+    #     out_dygraph, static_layer = fluid.dygraph.TracedLayer.trace(model, inputs=[img])
+    #     out_static_graph = static_layer([img])
+    #     print(len(out_static_graph)) # 1
+    #     print(out_static_graph[0].shape) 
+    #     static_layer.save_inference_model(dirname='./saved_infer_model')
+    # from calc_flops import *
+    # place = fluid.CPUPlace()
+    # exe = fluid.Executor(place)
+    # program, feed_vars, fetch_vars = fluid.io.load_inference_model('./saved_infer_model', exe)
+    # total_flops_params, is_quantize = summary(program)
--- a/demo/once_for_all/dy_models/once_for_all_kernel.py
+++ b/demo/once_for_all/dy_models/once_for_all_kernel.py
--- a/demo/once_for_all/reader.py
+++ b/demo/once_for_all/reader.py
+#   Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+import sys
+import os
+import math
+import random
+import functools
+import numpy as np
+import cv2
+import paddle
+from paddle import fluid
+from utils.autoaugment import ImageNetPolicy
+from PIL import Image
+policy = None
+random.seed(0)
+np.random.seed(0)
+def rotate_image(img):
+    """rotate image
+    Args:
+        img: image data
+    Returns:
+        rotated image data
+    """
+    (h, w) = img.shape[:2]
+    center = (w / 2, h / 2)
+    angle = np.random.randint(-10, 11)
+    M = cv2.getRotationMatrix2D(center, angle, 1.0)
+    rotated = cv2.warpAffine(img, M, (w, h))
+    return rotated
+def random_crop(img, size, settings, scale=None, ratio=None,
+                interpolation=None):
+    """random crop image
+    Args:
+        img: image data
+        size: crop size
+        settings: arguments
+        scale: scale parameter
+        ratio: ratio parameter
+    Returns:
+        random cropped image data
+    """
+    lower_scale = settings.lower_scale
+    lower_ratio = settings.lower_ratio
+    upper_ratio = settings.upper_ratio
+    scale = [lower_scale, 1.0] if scale is None else scale
+    ratio = [lower_ratio, upper_ratio] if ratio is None else ratio
+    aspect_ratio = math.sqrt(np.random.uniform(*ratio))
+    w = 1. * aspect_ratio
+    h = 1. / aspect_ratio
+    bound = min((float(img.shape[0]) / img.shape[1]) / (h**2),
+                (float(img.shape[1]) / img.shape[0]) / (w**2))
+    scale_max = min(scale[1], bound)
+    scale_min = min(scale[0], bound)
+    target_area = img.shape[0] * img.shape[1] * np.random.uniform(scale_min,
+                                                                  scale_max)
+    target_size = math.sqrt(target_area)
+    w = int(target_size * w)
+    h = int(target_size * h)
+    i = np.random.randint(0, img.shape[0] - h + 1)
+    j = np.random.randint(0, img.shape[1] - w + 1)
+    img = img[i:i + h, j:j + w, :]
+    if interpolation:
+        resized = cv2.resize(img, (size, size), interpolation=interpolation)
+    else:
+        resized = cv2.resize(img, (size, size))
+    return resized
+#NOTE:(2019/08/08) distort color func is not implemented
+def distort_color(img):
+    """distort image color
+    Args:
+        img: image data
+    Returns:
+        distorted color image data
+    """
+    return img
+def resize_short(img, target_size, interpolation=None):
+    """resize image
+    Args:
+        img: image data
+        target_size: resize short target size
+        interpolation: interpolation mode
+    Returns:
+        resized image data
+    """
+    percent = float(target_size) / min(img.shape[0], img.shape[1])
+    resized_width = int(round(img.shape[1] * percent))
+    resized_height = int(round(img.shape[0] * percent))
+    if interpolation:
+        resized = cv2.resize(
+            img, (resized_width, resized_height), interpolation=interpolation)
+    else:
+        resized = cv2.resize(img, (resized_width, resized_height))
+    return resized
+def crop_image(img, target_size, center):
+    """crop image 
+    Args:
+        img: images data
+        target_size: crop target size
+        center: crop mode
+    Returns:
+        img: cropped image data
+    """
+    height, width = img.shape[:2]
+    size = target_size
+    if center == True:
+        w_start = (width - size) // 2
+        h_start = (height - size) // 2
+    else:
+        w_start = np.random.randint(0, width - size + 1)
+        h_start = np.random.randint(0, height - size + 1)
+    w_end = w_start + size
+    h_end = h_start + size
+    img = img[h_start:h_end, w_start:w_end, :]
+    return img
+def create_mixup_reader(settings, rd):
+    """
+    """
+    class context:
+        tmp_mix = []
+        tmp_l1 = []
+        tmp_l2 = []
+        tmp_lam = []
+    alpha = settings.mixup_alpha
+    def fetch_data():
+        for item in rd():
+            yield item
+    def mixup_data():
+        for data_list in fetch_data():
+            if alpha > 0.:
+                lam = np.random.beta(alpha, alpha)
+            else:
+                lam = 1.
+            l1 = np.array(data_list)
+            l2 = np.random.permutation(l1)
+            mixed_l = [
+                l1[i][0] * lam + (1 - lam) * l2[i][0] for i in range(len(l1))
+            ]
+            yield (mixed_l, l1, l2, lam)
+    def mixup_reader():
+        for context.tmp_mix, context.tmp_l1, context.tmp_l2, context.tmp_lam in mixup_data(
+        ):
+            for i in range(len(context.tmp_mix)):
+                mixed_l = context.tmp_mix[i]
+                l1 = context.tmp_l1[i]
+                l2 = context.tmp_l2[i]
+                lam = context.tmp_lam
+                yield (mixed_l, int(l1[1]), int(l2[1]), float(lam))
+    return mixup_reader
+def process_image(sample, settings, mode, color_jitter, rotate):
+    """ process_image """
+    mean = settings.image_mean
+    std = settings.image_std
+    crop_size = settings.crop_size
+    img_path = sample[0]
+    img = cv2.imread(img_path)
+    if mode == 'train':
+        if rotate:
+            img = rotate_image(img)
+        if crop_size > 0:
+            img = random_crop(
+                img, crop_size, settings, interpolation=settings.interpolation)
+        if color_jitter:
+            img = distort_color(img)
+        if np.random.randint(0, 2) == 1:
+            img = img[:, ::-1, :]
+    else:
+        if crop_size > 0:
+            target_size = settings.resize_short_size
+            img = resize_short(
+                img, target_size, interpolation=settings.interpolation)
+            img = crop_image(img, target_size=crop_size, center=True)
+    img = img[:, :, ::-1]
+    if 'use_aa' in settings and settings.use_aa and mode == 'train':
+        img = np.ascontiguousarray(img)
+        img = Image.fromarray(img)
+        img = policy(img)
+        img = np.asarray(img)
+    img = img.astype('float32').transpose((2, 0, 1)) / 255
+    img_mean = np.array(mean).reshape((3, 1, 1))
+    img_std = np.array(std).reshape((3, 1, 1))
+    img -= img_mean
+    img /= img_std
+    if mode == 'train' or mode == 'val':
+        return (img, sample[1])
+    elif mode == 'test':
+        return (img, )
+def process_batch_data(input_data, settings, mode, color_jitter, rotate):
+    batch_data = []
+    for sample in input_data:
+        if os.path.isfile(sample[0]):
+            batch_data.append(
+                process_image(sample, settings, mode, color_jitter, rotate))
+        else:
+            print("File not exist : %s" % sample[0])
+    return batch_data
+class ImageNetReader:
+    def __init__(self, seed=None, place_num=1):
+        self.shuffle_seed = seed
+        self.place_num = place_num
+    def set_shuffle_seed(self, seed):
+        assert isinstance(seed, int), "shuffle seed must be int"
+        self.shuffle_seed = seed
+    def _reader_creator(self,
+                        settings,
+                        file_list,
+                        mode,
+                        shuffle=False,
+                        color_jitter=False,
+                        rotate=False,
+                        data_dir=None):
+        num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
+        if mode == 'test':
+            batch_size = 1
+        else:
+            batch_size = settings.batch_size / self.place_num
+        def reader():
+            def read_file_list():
+                with open(file_list) as flist:
+                    full_lines = [line.strip() for line in flist]
+                    if mode != "test" and len(full_lines) < settings.batch_size:
+                        print(
+                            "Warning: The number of the whole data ({}) is smaller than the batch_size ({}), and drop_last is turnning on, so nothing  will feed in program, Terminated now. Please reset batch_size to a smaller number or feed more data!"
+                            .format(len(full_lines), settings.batch_size))
+                        os._exit(1)
+                    if num_trainers > 1 and mode == "train":
+                        assert self.shuffle_seed is not None, "multiprocess train, shuffle seed must be set!"
+                        np.random.RandomState(self.shuffle_seed).shuffle(
+                            full_lines)
+                    elif shuffle:
+                        assert self.shuffle_seed is not None, "multiprocess train, shuffle seed must be set!"
+                        np.random.RandomState(self.shuffle_seed).shuffle(
+                            full_lines)
+                batch_data = []
+                for line in full_lines:
+                    img_path, label = line.split()
+                    img_path = os.path.join(data_dir, img_path)
+                    batch_data.append([img_path, int(label)])
+                    if len(batch_data) == batch_size:
+                        if mode == 'train' or mode == 'val' or mode == 'test':
+                            yield batch_data
+                        batch_data = []
+            return read_file_list
+        data_reader = reader()
+        if mode == 'train' and num_trainers > 1:
+            assert self.shuffle_seed is not None, \
+                "If num_trainers > 1, the shuffle_seed must be set, because " \
+                "the order of batch data generated by reader " \
+                "must be the same in the respective processes."
+            data_reader = paddle.fluid.contrib.reader.distributed_batch_reader(
+                data_reader)
+        mapper = functools.partial(
+            process_batch_data,
+            settings=settings,
+            mode=mode,
+            color_jitter=color_jitter,
+            rotate=rotate)
+        ret = fluid.io.xmap_readers(
+            mapper,
+            data_reader,
+            settings.reader_thread,
+            settings.reader_buf_size,
+            order=False)
+        return ret
+    def train(self, settings):
+        """Create a reader for trainning
+        Args:
+            settings: arguments
+        Returns:
+            train reader
+        """
+        file_list = os.path.join(settings.data_dir, 'train_list.txt')
+        assert os.path.isfile(
+            file_list), "{} doesn't exist, please check data list path".format(
+                file_list)
+        if 'use_aa' in settings and settings.use_aa:
+            global policy
+            policy = ImageNetPolicy()
+        reader = self._reader_creator(
+            settings,
+            file_list,
+            'train',
+            shuffle=True,
+            color_jitter=False,
+            rotate=False,
+            data_dir=settings.data_dir)
+        if settings.use_mixup == True:
+            reader = create_mixup_reader(settings, reader)
+            reader = fluid.io.batch(
+                reader,
+                batch_size=int(settings.batch_size / self.place_num),
+                drop_last=True)
+        return reader
+    def val(self, settings):
+        """Create a reader for eval
+        Args:
+            settings: arguments
+        Returns:
+            eval reader
+        """
+        file_list = os.path.join(settings.data_dir, 'val_list.txt')
+        assert os.path.isfile(
+            file_list), "{} doesn't exist, please check data list path".format(
+                file_list)
+        return self._reader_creator(
+            settings,
+            file_list,
+            'val',
+            shuffle=False,
+            data_dir=settings.data_dir)
+    def test(self, settings):
+        """Create a reader for testing
+        Args:
+            settings: arguments
+        Returns:
+            test reader
+        """
+        file_list = os.path.join(settings.data_dir, 'val_list.txt')
+        assert os.path.isfile(
+            file_list), "{} doesn't exist, please check data list path".format(
+                file_list)
+        return self._reader_creator(
+            settings,
+            file_list,
+            'test',
+            shuffle=False,
+            data_dir=settings.data_dir)
--- a/demo/once_for_all/start.sh
+++ b/demo/once_for_all/start.sh
+export PYTHONPATH=$PWD:$PYTHONPATH
+python -m paddle.distributed.launch --log_dir my_log train.py \
+    --use_data_parallel=True \
+    --batch_size=2048 \
+    --lr=1e-3 \
+    --l2_decay=3e-5 \
+    --total_images=1281167 \
+    --class_dim=1000 \
+    --image_shape=3,224,224 \
+    --model_save_dir=output.ofa.kernel \
+    --lr_strategy=piecewise_decay \
+    --num_epochs=360 \
+    --data_dir=./data/ILSVRC2012 \
+    --model=once_for_all_kernel \
+    --use_aa=False \
+    --checkpoint=./_ofa_epoch359
--- a/demo/once_for_all/train.py
+++ b/demo/once_for_all/train.py
+#   Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import time
+import sys
+import math
+import argparse
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.initializer import MSRA
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.dygraph.base import to_variable
+from paddle.fluid import framework
+import reader
+from utils import *
+from dy_models import MobileNetV1, MobileNetV2, MobileNetV3_dy, OFA_kernel
+args = parse_args()
+if int(os.getenv("PADDLE_TRAINER_ID", 0)) == 0:
+    print_arguments(args)
+def dy_token_gene(ori_token, alter=[3, 5, 7], num_of_changes=1):
+    kernel_token = tuple(ori_token[:20])
+    exp_token = tuple(ori_token[20:40])
+    depth_token = tuple(ori_token[40:45])
+    dy_token = []
+    for depth_id in range(len(depth_token)):
+        for repeat_time in range(depth_token[depth_id]):
+            dy_token.append(kernel_token[depth_id * 4 + repeat_time])
+    idx_lis = np.random.choice(len(dy_token), num_of_changes, replace=False)
+    for idx in idx_lis:
+        while True:
+            tmp_kernel = alter[np.random.choice(len(alter))]
+            if tmp_kernel != dy_token[idx]:
+                dy_token[idx] = tmp_kernel
+                break
+    return dy_token
+def dy_token_gene_v2(ori_token, alter=[3, 5, 7], position_from_end=-1):
+    kernel_token = tuple(ori_token[:20])
+    exp_token = tuple(ori_token[20:40])
+    depth_token = tuple(ori_token[40:45])
+    dy_token = []
+    dy_trainable = []
+    for depth_id in range(len(depth_token)):
+        for repeat_time in range(depth_token[depth_id]):
+            dy_token.append(kernel_token[depth_id * 4 + repeat_time])
+            dy_trainable.append(False)
+    dy_token[position_from_end] = alter[0]
+    dy_trainable[position_from_end] = True
+    for i in range(abs(position_from_end) - 1):
+        dy_token[position_from_end + i + 1] = alter[np.random.choice(
+            len(alter))]
+    return dy_token, dy_trainable
+def eval(net, test_data_loader, eop, position_from_end=1):
+    total_loss = 0.0
+    total_acc1 = 0.0
+    total_acc5 = 0.0
+    total_sample = 0
+    t_last = 0
+    token = [
+        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3,
+        3, 4, 3, 4, 3, 6, 4, 4, 4, 4, 6, 3, 6, 4, 6, 3, 2, 3, 3, 3, 4
+    ]
+    place_num = paddle.fluid.core.get_cuda_device_count(
+    ) if args.use_gpu else int(os.environ.get('CPU_NUM', 1))
+    for i, (img, label) in enumerate(test_data_loader()):
+        t1 = time.time()
+        label = to_variable(label.numpy().astype('int64').reshape(
+            int(args.batch_size // place_num), 1))
+        if args.model == "once_for_all_kernel":
+            dy_token, dy_trainable = dy_token_gene_v2(
+                token, alter=[5, 7], position_from_end=position_from_end)
+            print(dy_token)
+            out = net(img, dy_token=dy_token, dy_trainable=dy_trainable)
+        else:
+            out = net(img)
+        softmax_out = fluid.layers.softmax(out, use_cudnn=False)
+        loss = fluid.layers.cross_entropy(input=softmax_out, label=label)
+        avg_loss = fluid.layers.mean(x=loss)
+        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+        t2 = time.time()
+        print( "test | epoch id: %d, batch id: %d, avg_loss %0.5f acc_top1 %0.5f acc_top5 %0.5f %2.4f sec read_t:%2.4f" % \
+                (eop, i, avg_loss.numpy(), acc_top1.numpy(), acc_top5.numpy(), t2 - t1 , t1 - t_last))
+        sys.stdout.flush()
+        total_loss += avg_loss.numpy()
+        total_acc1 += acc_top1.numpy()
+        total_acc5 += acc_top5.numpy()
+        total_sample += 1
+        t_last = time.time()
+    print("final eval loss %0.3f acc1 %0.3f acc5 %0.3f" % \
+          (total_loss / total_sample, \
+           total_acc1 / total_sample, total_acc5 / total_sample))
+    sys.stdout.flush()
+def train_mobilenet():
+    if not args.use_gpu:
+        place = fluid.CPUPlace()
+    elif not args.use_data_parallel:
+        place = fluid.CUDAPlace(0)
+    else:
+        place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)
+    with fluid.dygraph.guard(place):
+        # 1. init net and optimizer
+        place_num = paddle.fluid.core.get_cuda_device_count(
+        ) if args.use_gpu else int(os.environ.get('CPU_NUM', 1))
+        if args.ce:
+            print("ce mode")
+            seed = 33
+            np.random.seed(seed)
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+        if args.use_data_parallel:
+            strategy = fluid.dygraph.parallel.prepare_context()
+        if args.model == "MobileNetV1":
+            net = MobileNetV1(class_dim=args.class_dim, scale=1.0)
+            model_path_pre = 'mobilenet_v1'
+        elif args.model == "MobileNetV2":
+            net = MobileNetV2(class_dim=args.class_dim, scale=1.0)
+            model_path_pre = 'mobilenet_v2'
+        elif args.model == "MobileNetV3":
+            net = MobileNetV3_dy(
+                class_dim=args.class_dim, scale=1.0, model_name='large')
+            model_path_pre = 'mobilenet_v3'
+        elif args.model == "once_for_all":
+            token = [
+                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3,
+                3, 3, 3, 3, 4, 3, 4, 3, 6, 4, 4, 4, 4, 6, 3, 6, 4, 6, 3, 2, 3,
+                3, 3, 4
+            ]
+            net = OFA_kernel(
+                class_dim=args.class_dim,
+                scale=1.0,
+                model_name='large',
+                token=token,
+                ofa_mode=False)
+            model_path_pre = 'ofa'
+        elif args.model == "once_for_all_kernel":
+            token = [
+                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3,
+                3, 3, 3, 3, 4, 3, 4, 3, 6, 4, 4, 4, 4, 6, 3, 6, 4, 6, 3, 2, 3,
+                3, 3, 4
+            ]
+            net = OFA_kernel(
+                class_dim=args.class_dim,
+                scale=1.0,
+                model_name='large',
+                token=token,
+                ofa_mode='kernel',
+                trainable_besides_trans=False)
+            model_path_pre = 'ofa_kernel'
+        else:
+            print(
+                "wrong model name, please try model = MobileNetV1 or MobileNetV2"
+            )
+            exit()
+        optimizer = create_optimizer(args=args, parameter_list=net.parameters())
+        if args.use_data_parallel:
+            net = fluid.dygraph.parallel.DataParallel(net, strategy)
+        # 2. load checkpoint
+        if args.checkpoint:
+            assert os.path.exists(args.checkpoint + ".pdparams"), \
+                "Given dir {}.pdparams not exist.".format(args.checkpoint)
+            assert os.path.exists(args.checkpoint + ".pdopt"), \
+                "Given dir {}.pdopt not exist.".format(args.checkpoint)
+            para_dict, opti_dict = fluid.dygraph.load_dygraph(args.checkpoint)
+            if args.model == "once_for_all_kernel":
+                inner_state_dict = net.state_dict()
+                for name, para in inner_state_dict.items():
+                    key_name = name
+                    if key_name in para_dict:
+                        print(key_name)
+                        para.set_value(para_dict[key_name])
+            else:
+                net.set_dict(para_dict)
+                optimizer.set_dict(opti_dict)
+        # 3. reader
+        train_data_loader, train_data = utility.create_data_loader(
+            is_train=True, args=args)
+        test_data_loader, test_data = utility.create_data_loader(
+            is_train=False, args=args)
+        num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
+        imagenet_reader = reader.ImageNetReader(seed=0, place_num=place_num)
+        train_reader = imagenet_reader.train(settings=args)
+        test_reader = imagenet_reader.val(settings=args)
+        train_data_loader.set_sample_list_generator(train_reader, place)
+        test_data_loader.set_sample_list_generator(test_reader, place)
+        with open('./dy_token.txt', 'r+') as f:
+            lines = f.readlines()
+        eval_sign = False
+        if eval_sign:
+            net.eval()
+            eval(net, test_data_loader, 1, -1)
+            exit()
+        # 4. train loop
+        total_batch_num = 0  #this is for benchmark
+        # Hyper-Params
+        total_epoch_each_layer = 15
+        fixed_epoch_each_layer = 10
+        position = 0
+        count = 0
+        for eop in range(args.num_epochs):
+            if num_trainers > 1:
+                imagenet_reader.set_shuffle_seed(eop + (
+                    args.random_seed if args.random_seed else 0))
+            net.train()
+            total_loss = 0.0
+            total_acc1 = 0.0
+            total_acc5 = 0.0
+            total_sample = 0
+            batch_id = 0
+            t_last = 0
+            if eop % total_epoch_each_layer == 0:
+                position += 1
+                count = 0
+                with open(
+                        os.path.join('./token', str(position) + '.txt'),
+                        'r+') as f:
+                    lines = f.readlines()
+            count += 1
+            # 4.1 for each batch, call net() , backward(), and minimize()
+            for idx, (img, label) in enumerate(train_data_loader()):
+                t1 = time.time()
+                if args.max_iter and total_batch_num == args.max_iter:
+                    return
+                label = to_variable(label.numpy().astype('int64').reshape(
+                    int(args.batch_size // place_num), 1))
+                t_start = time.time()
+                # 4.1.1 call net()
+                if args.model == "once_for_all_kernel":
+                    dy_token = [int(x) for x in lines[idx].strip().split(',')]
+                    if count <= fixed_epoch_each_layer:
+                        dy_trainable = [False for _ in range(len(dy_token))]
+                        dy_trainable[0 - position] = True
+                    else:
+                        dy_trainable = [False] * (len(dy_token) - position
+                                                  ) + [True] * position
+                    if idx < 10:
+                        print(dy_token)
+                        print(dy_trainable)
+                    out = net(img, dy_token=dy_token, dy_trainable=dy_trainable)
+                else:
+                    out = net(img)
+                t_end = time.time()
+                softmax_out = fluid.layers.softmax(out, use_cudnn=False)
+                loss = fluid.layers.cross_entropy(
+                    input=softmax_out, label=label)
+                avg_loss = fluid.layers.mean(x=loss)
+                acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+                acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+                t_start_back = time.time()
+                # 4.1.2 call backward()
+                if args.use_data_parallel:
+                    avg_loss = net.scale_loss(avg_loss)
+                    avg_loss.backward()
+                    net.apply_collective_grads()
+                else:
+                    avg_loss.backward()
+                t_end_back = time.time()
+                # 4.1.3 call minimize()
+                optimizer.minimize(avg_loss)
+                net.clear_gradients()
+                t2 = time.time()
+                train_batch_elapse = t2 - t1
+                if batch_id % args.print_step == 0:
+                    print( "epoch id: %d, batch step: %d, avg_loss %0.5f, acc_top1 %0.5f, acc_top5 %0.5f, lr %f, %2.4f sec, net_t:%2.4f, back_t:%2.4f, read_t:%2.4f" % \
+                            (eop, batch_id, avg_loss.numpy(), acc_top1.numpy(), acc_top5.numpy(), optimizer._global_learning_rate().numpy(), train_batch_elapse,
+                              t_end - t_start, t_end_back - t_start_back,  t1 - t_last))
+                    sys.stdout.flush()
+                total_loss += avg_loss.numpy()
+                total_acc1 += acc_top1.numpy()
+                total_acc5 += acc_top5.numpy()
+                total_sample += 1
+                batch_id += 1
+                t_last = time.time()
+                # NOTE: used for benchmark
+                total_batch_num = total_batch_num + 1
+            if args.ce:
+                print("kpis\ttrain_acc1\t%0.3f" % (total_acc1 / total_sample))
+                print("kpis\ttrain_acc5\t%0.3f" % (total_acc5 / total_sample))
+                print("kpis\ttrain_loss\t%0.3f" % (total_loss / total_sample))
+            print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f %2.4f sec" % \
+                  (eop, batch_id, total_loss / total_sample, \
+                   total_acc1 / total_sample, total_acc5 / total_sample, train_batch_elapse))
+            # 4.2 save checkpoint
+            save_parameters = (not args.use_data_parallel) or (
+                args.use_data_parallel and
+                fluid.dygraph.parallel.Env().local_rank == 0)
+            if save_parameters:
+                if not os.path.isdir(args.model_save_dir):
+                    os.makedirs(args.model_save_dir)
+                model_path = os.path.join(
+                    args.model_save_dir,
+                    "_" + model_path_pre + "_epoch{}".format(eop))
+                fluid.dygraph.save_dygraph(net.state_dict(), model_path)
+                fluid.dygraph.save_dygraph(optimizer.state_dict(), model_path)
+            # 4.3 validation
+            net.eval()
+            eval(net, test_data_loader, eop, 0 - position)
+        # 5. save final results
+        save_parameters = (not args.use_data_parallel) or (
+            args.use_data_parallel and
+            fluid.dygraph.parallel.Env().local_rank == 0)
+        if save_parameters:
+            model_path = os.path.join(args.model_save_dir,
+                                      "_" + model_path_pre + "_final")
+            fluid.dygraph.save_dygraph(net.state_dict(), model_path)
+if __name__ == '__main__':
+    train_mobilenet()
--- a/demo/once_for_all/utils/__init__.py
+++ b/demo/once_for_all/utils/__init__.py
+from .optimizer import Optimizer, create_optimizer
+from .utility import add_arguments, print_arguments, parse_args, check_gpu, check_args, check_version, init_model, save_model, create_data_loader, print_info, best_strategy_compiled, init_model, save_model, ExponentialMovingAverage
\ No newline at end of file
--- a/demo/once_for_all/utils/autoaugment.py
+++ b/demo/once_for_all/utils/autoaugment.py
+#   Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+"""
+This code is based on https://github.com/DeepVoltaire/AutoAugment/blob/master/autoaugment.py
+"""
+from PIL import Image, ImageEnhance, ImageOps
+import numpy as np
+import random
+class ImageNetPolicy(object):
+    """ Randomly choose one of the best 24 Sub-policies on ImageNet.
+        Example:
+        >>> policy = ImageNetPolicy()
+        >>> transformed = policy(image)
+        Example as a PyTorch Transform:
+        >>> transform=transforms.Compose([
+        >>>     transforms.Resize(256),
+        >>>     ImageNetPolicy(),
+        >>>     transforms.ToTensor()])
+    """
+    def __init__(self, fillcolor=(128, 128, 128)):
+        self.policies = [
+            SubPolicy(0.4, "posterize", 8, 0.6, "rotate", 9, fillcolor),
+            SubPolicy(0.6, "solarize", 5, 0.6, "autocontrast", 5, fillcolor),
+            SubPolicy(0.8, "equalize", 8, 0.6, "equalize", 3, fillcolor),
+            SubPolicy(0.6, "posterize", 7, 0.6, "posterize", 6, fillcolor),
+            SubPolicy(0.4, "equalize", 7, 0.2, "solarize", 4, fillcolor),
+            SubPolicy(0.4, "equalize", 4, 0.8, "rotate", 8, fillcolor),
+            SubPolicy(0.6, "solarize", 3, 0.6, "equalize", 7, fillcolor),
+            SubPolicy(0.8, "posterize", 5, 1.0, "equalize", 2, fillcolor),
+            SubPolicy(0.2, "rotate", 3, 0.6, "solarize", 8, fillcolor),
+            SubPolicy(0.6, "equalize", 8, 0.4, "posterize", 6, fillcolor),
+            SubPolicy(0.8, "rotate", 8, 0.4, "color", 0, fillcolor),
+            SubPolicy(0.4, "rotate", 9, 0.6, "equalize", 2, fillcolor),
+            SubPolicy(0.0, "equalize", 7, 0.8, "equalize", 8, fillcolor),
+            SubPolicy(0.6, "invert", 4, 1.0, "equalize", 8, fillcolor),
+            SubPolicy(0.6, "color", 4, 1.0, "contrast", 8, fillcolor),
+            SubPolicy(0.8, "rotate", 8, 1.0, "color", 2, fillcolor),
+            SubPolicy(0.8, "color", 8, 0.8, "solarize", 7, fillcolor),
+            SubPolicy(0.4, "sharpness", 7, 0.6, "invert", 8, fillcolor),
+            SubPolicy(0.6, "shearX", 5, 1.0, "equalize", 9, fillcolor),
+            SubPolicy(0.4, "color", 0, 0.6, "equalize", 3, fillcolor),
+            SubPolicy(0.4, "equalize", 7, 0.2, "solarize", 4, fillcolor),
+            SubPolicy(0.6, "solarize", 5, 0.6, "autocontrast", 5, fillcolor),
+            SubPolicy(0.6, "invert", 4, 1.0, "equalize", 8, fillcolor),
+            SubPolicy(0.6, "color", 4, 1.0, "contrast", 8, fillcolor),
+            SubPolicy(0.8, "equalize", 8, 0.6, "equalize", 3, fillcolor)
+        ]
+    def __call__(self, img, policy_idx=None):
+        if policy_idx is None or not isinstance(policy_idx, int):
+            policy_idx = random.randint(0, len(self.policies) - 1)
+        else:
+            policy_idx = policy_idx % len(self.policies)
+        return self.policies[policy_idx](img)
+    def __repr__(self):
+        return "AutoAugment ImageNet Policy"
+class CIFAR10Policy(object):
+    """ Randomly choose one of the best 25 Sub-policies on CIFAR10.
+        Example:
+        >>> policy = CIFAR10Policy()
+        >>> transformed = policy(image)
+        Example as a PyTorch Transform:
+        >>> transform=transforms.Compose([
+        >>>     transforms.Resize(256),
+        >>>     CIFAR10Policy(),
+        >>>     transforms.ToTensor()])
+    """
+    def __init__(self, fillcolor=(128, 128, 128)):
+        self.policies = [
+            SubPolicy(0.1, "invert", 7, 0.2, "contrast", 6, fillcolor),
+            SubPolicy(0.7, "rotate", 2, 0.3, "translateX", 9, fillcolor),
+            SubPolicy(0.8, "sharpness", 1, 0.9, "sharpness", 3, fillcolor),
+            SubPolicy(0.5, "shearY", 8, 0.7, "translateY", 9, fillcolor),
+            SubPolicy(0.5, "autocontrast", 8, 0.9, "equalize", 2, fillcolor),
+            SubPolicy(0.2, "shearY", 7, 0.3, "posterize", 7, fillcolor),
+            SubPolicy(0.4, "color", 3, 0.6, "brightness", 7, fillcolor),
+            SubPolicy(0.3, "sharpness", 9, 0.7, "brightness", 9, fillcolor),
+            SubPolicy(0.6, "equalize", 5, 0.5, "equalize", 1, fillcolor),
+            SubPolicy(0.6, "contrast", 7, 0.6, "sharpness", 5, fillcolor),
+            SubPolicy(0.7, "color", 7, 0.5, "translateX", 8, fillcolor),
+            SubPolicy(0.3, "equalize", 7, 0.4, "autocontrast", 8, fillcolor),
+            SubPolicy(0.4, "translateY", 3, 0.2, "sharpness", 6, fillcolor),
+            SubPolicy(0.9, "brightness", 6, 0.2, "color", 8, fillcolor),
+            SubPolicy(0.5, "solarize", 2, 0.0, "invert", 3, fillcolor),
+            SubPolicy(0.2, "equalize", 0, 0.6, "autocontrast", 0, fillcolor),
+            SubPolicy(0.2, "equalize", 8, 0.8, "equalize", 4, fillcolor),
+            SubPolicy(0.9, "color", 9, 0.6, "equalize", 6, fillcolor),
+            SubPolicy(0.8, "autocontrast", 4, 0.2, "solarize", 8, fillcolor),
+            SubPolicy(0.1, "brightness", 3, 0.7, "color", 0, fillcolor),
+            SubPolicy(0.4, "solarize", 5, 0.9, "autocontrast", 3, fillcolor),
+            SubPolicy(0.9, "translateY", 9, 0.7, "translateY", 9, fillcolor),
+            SubPolicy(0.9, "autocontrast", 2, 0.8, "solarize", 3, fillcolor),
+            SubPolicy(0.8, "equalize", 8, 0.1, "invert", 3, fillcolor),
+            SubPolicy(0.7, "translateY", 9, 0.9, "autocontrast", 1, fillcolor)
+        ]
+    def __call__(self, img, policy_idx=None):
+        if policy_idx is None or not isinstance(policy_idx, int):
+            policy_idx = random.randint(0, len(self.policies) - 1)
+        else:
+            policy_idx = policy_idx % len(self.policies)
+        return self.policies[policy_idx](img)
+    def __repr__(self):
+        return "AutoAugment CIFAR10 Policy"
+class SVHNPolicy(object):
+    """ Randomly choose one of the best 25 Sub-policies on SVHN.
+        Example:
+        >>> policy = SVHNPolicy()
+        >>> transformed = policy(image)
+        Example as a PyTorch Transform:
+        >>> transform=transforms.Compose([
+        >>>     transforms.Resize(256),
+        >>>     SVHNPolicy(),
+        >>>     transforms.ToTensor()])
+    """
+    def __init__(self, fillcolor=(128, 128, 128)):
+        self.policies = [
+            SubPolicy(0.9, "shearX", 4, 0.2, "invert", 3, fillcolor),
+            SubPolicy(0.9, "shearY", 8, 0.7, "invert", 5, fillcolor),
+            SubPolicy(0.6, "equalize", 5, 0.6, "solarize", 6, fillcolor),
+            SubPolicy(0.9, "invert", 3, 0.6, "equalize", 3, fillcolor),
+            SubPolicy(0.6, "equalize", 1, 0.9, "rotate", 3, fillcolor),
+            SubPolicy(0.9, "shearX", 4, 0.8, "autocontrast", 3, fillcolor),
+            SubPolicy(0.9, "shearY", 8, 0.4, "invert", 5, fillcolor),
+            SubPolicy(0.9, "shearY", 5, 0.2, "solarize", 6, fillcolor),
+            SubPolicy(0.9, "invert", 6, 0.8, "autocontrast", 1, fillcolor),
+            SubPolicy(0.6, "equalize", 3, 0.9, "rotate", 3, fillcolor),
+            SubPolicy(0.9, "shearX", 4, 0.3, "solarize", 3, fillcolor),
+            SubPolicy(0.8, "shearY", 8, 0.7, "invert", 4, fillcolor),
+            SubPolicy(0.9, "equalize", 5, 0.6, "translateY", 6, fillcolor),
+            SubPolicy(0.9, "invert", 4, 0.6, "equalize", 7, fillcolor),
+            SubPolicy(0.3, "contrast", 3, 0.8, "rotate", 4, fillcolor),
+            SubPolicy(0.8, "invert", 5, 0.0, "translateY", 2, fillcolor),
+            SubPolicy(0.7, "shearY", 6, 0.4, "solarize", 8, fillcolor),
+            SubPolicy(0.6, "invert", 4, 0.8, "rotate", 4, fillcolor), SubPolicy(
+                0.3, "shearY", 7, 0.9, "translateX", 3, fillcolor), SubPolicy(
+                    0.1, "shearX", 6, 0.6, "invert", 5, fillcolor), SubPolicy(
+                        0.7, "solarize", 2, 0.6, "translateY", 7, fillcolor),
+            SubPolicy(0.8, "shearY", 4, 0.8, "invert", 8, fillcolor), SubPolicy(
+                0.7, "shearX", 9, 0.8, "translateY", 3, fillcolor), SubPolicy(
+                    0.8, "shearY", 5, 0.7, "autocontrast", 3, fillcolor),
+            SubPolicy(0.7, "shearX", 2, 0.1, "invert", 5, fillcolor)
+        ]
+    def __call__(self, img, policy_idx=None):
+        if policy_idx is None or not isinstance(policy_idx, int):
+            policy_idx = random.randint(0, len(self.policies) - 1)
+        else:
+            policy_idx = policy_idx % len(self.policies)
+        return self.policies[policy_idx](img)
+    def __repr__(self):
+        return "AutoAugment SVHN Policy"
+class SubPolicy(object):
+    def __init__(self,
+                 p1,
+                 operation1,
+                 magnitude_idx1,
+                 p2,
+                 operation2,
+                 magnitude_idx2,
+                 fillcolor=(128, 128, 128)):
+        ranges = {
+            "shearX": np.linspace(0, 0.3, 10),
+            "shearY": np.linspace(0, 0.3, 10),
+            "translateX": np.linspace(0, 150 / 331, 10),
+            "translateY": np.linspace(0, 150 / 331, 10),
+            "rotate": np.linspace(0, 30, 10),
+            "color": np.linspace(0.0, 0.9, 10),
+            "posterize": np.round(np.linspace(8, 4, 10), 0).astype(np.int),
+            "solarize": np.linspace(256, 0, 10),
+            "contrast": np.linspace(0.0, 0.9, 10),
+            "sharpness": np.linspace(0.0, 0.9, 10),
+            "brightness": np.linspace(0.0, 0.9, 10),
+            "autocontrast": [0] * 10,
+            "equalize": [0] * 10,
+            "invert": [0] * 10
+        }
+        # from https://stackoverflow.com/questions/5252170/specify-image-filling-color-when-rotating-in-python-with-pil-and-setting-expand
+        def rotate_with_fill(img, magnitude):
+            rot = img.convert("RGBA").rotate(magnitude)
+            return Image.composite(rot,
+                                   Image.new("RGBA", rot.size, (128, ) * 4),
+                                   rot).convert(img.mode)
+        func = {
+            "shearX": lambda img, magnitude: img.transform(
+                img.size, Image.AFFINE, (1, magnitude * random.choice([-1, 1]), 0, 0, 1, 0),
+                Image.BICUBIC, fillcolor=fillcolor),
+            "shearY": lambda img, magnitude: img.transform(
+                img.size, Image.AFFINE, (1, 0, 0, magnitude * random.choice([-1, 1]), 1, 0),
+                Image.BICUBIC, fillcolor=fillcolor),
+            "translateX": lambda img, magnitude: img.transform(
+                img.size, Image.AFFINE, (1, 0, magnitude * img.size[0] * random.choice([-1, 1]), 0, 1, 0),
+                fillcolor=fillcolor),
+            "translateY": lambda img, magnitude: img.transform(
+                img.size, Image.AFFINE, (1, 0, 0, 0, 1, magnitude * img.size[1] * random.choice([-1, 1])),
+                fillcolor=fillcolor),
+            "rotate": lambda img, magnitude: rotate_with_fill(img, magnitude),
+            # "rotate": lambda img, magnitude: img.rotate(magnitude * random.choice([-1, 1])),
+            "color": lambda img, magnitude: ImageEnhance.Color(img).enhance(1 + magnitude * random.choice([-1, 1])),
+            "posterize": lambda img, magnitude: ImageOps.posterize(img, magnitude),
+            "solarize": lambda img, magnitude: ImageOps.solarize(img, magnitude),
+            "contrast": lambda img, magnitude: ImageEnhance.Contrast(img).enhance(
+                1 + magnitude * random.choice([-1, 1])),
+            "sharpness": lambda img, magnitude: ImageEnhance.Sharpness(img).enhance(
+                1 + magnitude * random.choice([-1, 1])),
+            "brightness": lambda img, magnitude: ImageEnhance.Brightness(img).enhance(
+                1 + magnitude * random.choice([-1, 1])),
+            "autocontrast": lambda img, magnitude: ImageOps.autocontrast(img),
+            "equalize": lambda img, magnitude: ImageOps.equalize(img),
+            "invert": lambda img, magnitude: ImageOps.invert(img)
+        }
+        self.p1 = p1
+        self.operation1 = func[operation1]
+        self.magnitude1 = ranges[operation1][magnitude_idx1]
+        self.p2 = p2
+        self.operation2 = func[operation2]
+        self.magnitude2 = ranges[operation2][magnitude_idx2]
+    def __call__(self, img):
+        if random.random() < self.p1:
+            img = self.operation1(img, self.magnitude1)
+        if random.random() < self.p2:
+            img = self.operation2(img, self.magnitude2)
+        return img
--- a/demo/once_for_all/utils/dist_utils.py
+++ b/demo/once_for_all/utils/dist_utils.py
+#   Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import paddle.fluid as fluid
+def nccl2_prepare(args, startup_prog, main_prog):
+    config = fluid.DistributeTranspilerConfig()
+    config.mode = "nccl2"
+    t = fluid.DistributeTranspiler(config=config)
+    envs = args.dist_env
+    t.transpile(
+        envs["trainer_id"],
+        trainers=','.join(envs["trainer_endpoints"]),
+        current_endpoint=envs["current_endpoint"],
+        startup_program=startup_prog,
+        program=main_prog)
+def pserver_prepare(args, train_prog, startup_prog):
+    config = fluid.DistributeTranspilerConfig()
+    config.slice_var_up = args.split_var
+    t = fluid.DistributeTranspiler(config=config)
+    envs = args.dist_env
+    training_role = envs["training_role"]
+    t.transpile(
+        envs["trainer_id"],
+        program=train_prog,
+        pservers=envs["pserver_endpoints"],
+        trainers=envs["num_trainers"],
+        sync_mode=not args.async_mode,
+        startup_program=startup_prog)
+    if training_role == "PSERVER":
+        pserver_program = t.get_pserver_program(envs["current_endpoint"])
+        pserver_startup_program = t.get_startup_program(
+            envs["current_endpoint"],
+            pserver_program,
+            startup_program=startup_prog)
+        return pserver_program, pserver_startup_program
+    elif training_role == "TRAINER":
+        train_program = t.get_trainer_program()
+        return train_program, startup_prog
+    else:
+        raise ValueError(
+            'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
+        )
+def nccl2_prepare_paddle(trainer_id, startup_prog, main_prog):
+    config = fluid.DistributeTranspilerConfig()
+    config.mode = "nccl2"
+    t = fluid.DistributeTranspiler(config=config)
+    t.transpile(
+        trainer_id,
+        trainers=os.environ.get('PADDLE_TRAINER_ENDPOINTS'),
+        current_endpoint=os.environ.get('PADDLE_CURRENT_ENDPOINT'),
+        startup_program=startup_prog,
+        program=main_prog)
+def prepare_for_multi_process(exe, build_strategy, train_prog):
+    # prepare for multi-process
+    trainer_id = int(os.environ.get('PADDLE_TRAINER_ID', 0))
+    num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
+    if num_trainers < 2: return
+    print("PADDLE_TRAINERS_NUM", num_trainers)
+    print("PADDLE_TRAINER_ID", trainer_id)
+    build_strategy.num_trainers = num_trainers
+    build_strategy.trainer_id = trainer_id
+    # NOTE(zcd): use multi processes to train the model,
+    # and each process use one GPU card.
+    startup_prog = fluid.Program()
+    nccl2_prepare_paddle(trainer_id, startup_prog, train_prog)
+    # the startup_prog are run two times, but it doesn't matter.
+    exe.run(startup_prog)
--- a/demo/once_for_all/utils/optimizer.py
+++ b/demo/once_for_all/utils/optimizer.py
+#   Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import math
+import paddle.fluid as fluid
+import paddle.fluid.layers.ops as ops
+from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
+class CosineDecayWarmup(LearningRateDecay):
+    def __init__(self,
+                 learning_rate,
+                 step_each_epoch,
+                 epochs,
+                 warmup_epochs,
+                 begin=0,
+                 step=1,
+                 dtype='float32'):
+        super(CosineDecayWarmup, self).__init__(begin, step, dtype)
+        self.learning_rate = learning_rate
+        self.step_each_epoch = step_each_epoch
+        self.epochs = epochs
+        self.warmup_epochs = warmup_epochs
+    def step(self):
+        if (self.step_num / self.step_each_epoch) < self.warmup_epochs:
+            decayed_lr = self.learning_rate * (self.step_num / (
+                self.step_each_epoch * self.warmup_epochs))
+        else:
+            decayed_lr = self.learning_rate * 0.5 * (fluid.layers.cos(
+                self.create_lr_var((self.step_num - self.warmup_epochs *
+                                    self.step_each_epoch) * math.pi /
+                                   (self.epochs * self.step_each_epoch))) + 1)
+        return decayed_lr
+class Optimizer(object):
+    """A class used to represent several optimizer methods
+    Attributes:
+        batch_size: batch size on all devices.
+        lr: learning rate.
+        lr_strategy: learning rate decay strategy.
+        l2_decay: l2_decay parameter.
+        momentum_rate: momentum rate when using Momentum optimizer.
+        step_epochs: piecewise decay steps.
+        num_epochs: number of total epochs.
+        total_images: total images.
+        step: total steps in the an epoch.
+    """
+    def __init__(self, args, parameter_list):
+        self.parameter_list = parameter_list
+        self.batch_size = args.batch_size
+        self.lr = args.lr
+        self.lr_strategy = args.lr_strategy
+        self.l2_decay = args.l2_decay
+        self.momentum_rate = args.momentum_rate
+        self.step_epochs = args.step_epochs
+        self.num_epochs = args.num_epochs
+        self.warm_up_epochs = args.warm_up_epochs
+        self.decay_epochs = args.decay_epochs
+        self.decay_rate = args.decay_rate
+        self.total_images = args.total_images
+        self.step = int(math.ceil(float(self.total_images) / self.batch_size))
+    def piecewise_decay(self):
+        """piecewise decay with Momentum optimizer
+            Returns:
+            a piecewise_decay optimizer
+        """
+        bd = [self.step * e for e in self.step_epochs]
+        lr = [self.lr * (0.1**i) for i in range(len(bd) + 1)]
+        learning_rate = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=self.momentum_rate,
+            regularization=fluid.regularizer.L2Decay(self.l2_decay),
+            parameter_list=self.parameter_list)
+        return optimizer
+    def cosine_decay(self):
+        """cosine decay with Momentum optimizer
+        Returns:
+            a cosine_decay optimizer
+        """
+        learning_rate = fluid.layers.cosine_decay(
+            learning_rate=self.lr,
+            step_each_epoch=self.step,
+            epochs=self.num_epochs)
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=self.momentum_rate,
+            regularization=fluid.regularizer.L2Decay(self.l2_decay),
+            parameter_list=self.parameter_list)
+        return optimizer
+    def cosine_decay_warmup(self):
+        """cosine decay with warmup
+        Returns:
+            a cosine_decay_with_warmup optimizer
+        """
+        learning_rate = CosineDecayWarmup(
+            learning_rate=self.lr,
+            step_each_epoch=self.step,
+            epochs=self.num_epochs,
+            warmup_epochs=5)
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=self.momentum_rate,
+            regularization=fluid.regularizer.L2Decay(self.l2_decay),
+            parameter_list=self.parameter_list)
+        return optimizer
+    def linear_decay(self):
+        """linear decay with Momentum optimizer
+        Returns:
+            a linear_decay optimizer
+        """
+        end_lr = 0
+        learning_rate = fluid.layers.polynomial_decay(
+            self.lr, self.step, end_lr, power=1)
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=self.momentum_rate,
+            regularization=fluid.regularizer.L2Decay(self.l2_decay),
+            parameter_list=self.parameter_list)
+        return optimizer
+    def adam_decay(self):
+        """Adam optimizer
+        Returns: 
+            an adam_decay optimizer
+        """
+        return fluid.optimizer.Adam(
+            learning_rate=self.lr, parameter_list=self.parameter_list)
+    def cosine_decay_RMSProp(self):
+        """cosine decay with RMSProp optimizer
+        Returns: 
+            an cosine_decay_RMSProp optimizer
+        """
+        learning_rate = fluid.layers.cosine_decay(
+            learning_rate=self.lr,
+            step_each_epoch=self.step,
+            epochs=self.num_epochs)
+        optimizer = fluid.optimizer.RMSProp(
+            learning_rate=learning_rate,
+            momentum=self.momentum_rate,
+            regularization=fluid.regularizer.L2Decay(self.l2_decay),
+            # Apply epsilon=1 on ImageNet dataset.
+            epsilon=1,
+            parameter_list=self.parameter_list)
+        return optimizer
+    def default_decay(self):
+        """default decay
+        Returns:
+            default decay optimizer
+        """
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=self.lr,
+            momentum=self.momentum_rate,
+            regularization=fluid.regularizer.L2Decay(self.l2_decay),
+            parameter_list=self.parameter_list)
+        return optimizer
+def create_optimizer(args, parameter_list):
+    Opt = Optimizer(args, parameter_list)
+    optimizer = getattr(Opt, args.lr_strategy)()
+    return optimizer
--- a/demo/once_for_all/utils/utility.py
+++ b/demo/once_for_all/utils/utility.py