diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
index cb1c28b39b69952f87488a0fe2a9c3ee3bc96387..c94b77dd8c6428909a4a2b62cc19659a8635dcff 100755
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -107,7 +107,7 @@ class DistributedStrategy(object):
         All of the distributed training configurations can be configured in DistributedStrategy,
         such as automatic mixed precision (AMP), Layer-wise Adaptive Rate Scaling (LARS), 
         asynchronous update parameter server(ASGD), etc.
-        
+
         DistributedStrategy can be serialized into protobuf file or deserialized from protobuf file
 
         Users who run local training usually configure BuildStrategy and ExecutionStrategy, and 
@@ -128,8 +128,9 @@ class DistributedStrategy(object):
         Serialize current DistributedStrategy to string and save to output file
 
         Examples:
+
           .. code-block:: python
-        
+
             import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.dgc = True
@@ -145,6 +146,7 @@ class DistributedStrategy(object):
         Load from prototxt file for DistributedStrategy initialization
 
         Examples:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -161,10 +163,11 @@ class DistributedStrategy(object):
         Configure ExecutionStrategy for DistributedStrategy
 
         Examples:
+
           .. code-block:: python
 
             import paddle
-            exe_strategy = paddle.fluid.ExecutionStrategy()
+            exe_strategy = paddle.static.ExecutionStrategy()
             exe_strategy.num_threads = 10
             exe_strategy.num_iteration_per_drop_scope = 10
             exe_strategy.num_iteration_per_run = 10
@@ -195,10 +198,11 @@ class DistributedStrategy(object):
         only if the property is non-distributed strategy.
 
         Examples:
+
           .. code-block:: python
 
             import paddle
-            build_strategy = paddle.fluid.BuildStrategy()
+            build_strategy = paddle.static.BuildStrategy()
             build_strategy.enable_sequential_execution = True
             build_strategy.fuse_elewise_add_act_ops = True
             build_strategy.fuse_bn_act_ops = True
@@ -207,7 +211,7 @@ class DistributedStrategy(object):
             build_strategy.fuse_broadcast_ops = True
             build_strategy.fuse_all_optimizer_ops = True
             build_strategy.enable_inplace = True
-            
+
             strategy = paddle.distributed.fleet.DistributedStrategy()
             strategy.build_strategy = build_strategy
         """
@@ -240,6 +244,7 @@ class DistributedStrategy(object):
         Default value: True
 
         Examples:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -248,7 +253,7 @@ class DistributedStrategy(object):
 
             strategy = fleet.DistributedStrategy()
             strategy.a_sync = True  # by default this is True
-            
+
             # code block for defining loss and local optimizer
             # sgd = fleet.distributed_optimizer(optimizer, strategy)
         """
@@ -288,6 +293,7 @@ class DistributedStrategy(object):
             runtime_split_send_recv(bool): if we are using Tensor split for send and recv during runtime
 
         Examples:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -319,6 +325,7 @@ class DistributedStrategy(object):
         Default Value: False
 
         Examples:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -360,6 +367,7 @@ class DistributedStrategy(object):
             custom_black_list(list[str]): Users' custom black list which forbidden execution fp16.
 
         Examples:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -384,6 +392,7 @@ class DistributedStrategy(object):
         Default value: False
 
         Examples:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -401,6 +410,7 @@ class DistributedStrategy(object):
         We note that system overhead is usually lower when sync_nccl_allreduce = True
 
         Examples:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -425,6 +435,7 @@ class DistributedStrategy(object):
         allreduce among the leaders of each group
 
         Examples:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -450,6 +461,7 @@ class DistributedStrategy(object):
         Default value: number of GPU cards on each single GPU machine
 
         Example:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -472,10 +484,11 @@ class DistributedStrategy(object):
     def sync_batch_norm(self):
         """
         Indicating whether we are using sync_batch_norm to do synchronous batch normalization among all training nodes.
-        
+
         Default value: False
 
         Examples:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -500,6 +513,7 @@ class DistributedStrategy(object):
         Default value: True
 
         Examples:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -524,8 +538,9 @@ class DistributedStrategy(object):
         Default value: 32
 
         Examples:
+
           .. code-block:: python
-        
+
             import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.fuse_grad_size_in_MB = 50
@@ -562,8 +577,9 @@ class DistributedStrategy(object):
         Default value: 1
 
         Examples:
+
           .. code-block:: python
-        
+
             import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.nccl_comm_num = 2
@@ -594,8 +610,9 @@ class DistributedStrategy(object):
         implementation should have some manually assign checkpoints
 
         Examples:
+
           .. code-block:: python
-        
+
             import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.recompute = True
@@ -622,8 +639,9 @@ class DistributedStrategy(object):
         Default value: False
 
         Examples:
+
           .. code-block:: python
-          
+
             import paddle.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.sharding = True
@@ -649,8 +667,9 @@ class DistributedStrategy(object):
             and should be an empirical value decided by your model size and network topology.
 
         Examples:
+
           .. code-block:: python
-        
+
             import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.sharding = True
@@ -674,8 +693,9 @@ class DistributedStrategy(object):
         device_guard information in user-defined program.
 
         Examples:
+
           .. code-block:: python
-        
+
             import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.pipeline = True
@@ -709,8 +729,9 @@ class DistributedStrategy(object):
             **micro_batch**: the number of small batches in each user defined batch
 
         Examples:
+
           .. code-block:: python
-        
+
             import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.pipeline = True
@@ -736,6 +757,7 @@ class DistributedStrategy(object):
 
 
         Examples:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -764,6 +786,7 @@ class DistributedStrategy(object):
             begin_step(int) The step of begining training by localsgd. Default 1.
 
         Examples:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -791,6 +814,7 @@ class DistributedStrategy(object):
 
 
         Examples:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -821,6 +845,7 @@ class DistributedStrategy(object):
             begin_step(int) The step of begining training by adaptive localsgd. Default 1.
 
         Examples:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -848,6 +873,7 @@ class DistributedStrategy(object):
         Default Value: False
 
         Examples:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -884,6 +910,7 @@ class DistributedStrategy(object):
                     element will be transmitted.
 
         Examples:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -906,6 +933,7 @@ class DistributedStrategy(object):
         Default Value: False
 
         Examples:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -935,6 +963,7 @@ class DistributedStrategy(object):
         to model parameters.
 
         Examples:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -963,6 +992,7 @@ class DistributedStrategy(object):
             avg(bool): whether to average the gradients of each mini-batch, the default value is `True`
 
         Examples:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -989,6 +1019,7 @@ class DistributedStrategy(object):
         Default Value: False
 
         Examples:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -1019,6 +1050,7 @@ class DistributedStrategy(object):
         will be exclude from weight decay in lars formula.
 
         Examples:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -1048,8 +1080,9 @@ class DistributedStrategy(object):
         [Large Batch Optimization for Deep Learning: Training BERT in 76 minutes](https://arxiv.org/abs/1904.00962).
 
         Default Value: False
-        
+
         Examples:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -1078,6 +1111,7 @@ class DistributedStrategy(object):
         will be exclude from weight decay in lamb formula.
 
         Examples:
+
           .. code-block:: python
 
             import paddle.distributed.fleet as fleet
@@ -1123,11 +1157,12 @@ class DistributedStrategy(object):
         Default Value: False
 
         Examples:
+
           .. code-block:: python
 
             import paddle
-            import paddle.distributed.fleet as fleet
             paddle.enable_static()
+            import paddle.distributed.fleet as fleet
 
             strategy = fleet.DistributedStrategy()
             strategy.auto = True
@@ -1156,8 +1191,11 @@ class DistributedStrategy(object):
         Default Value: True
 
         Examples:
+
           .. code-block:: python
 
+            import paddle
+            paddle.enable_static()
             import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.cudnn_exhaustive_search = False
@@ -1187,15 +1225,18 @@ class DistributedStrategy(object):
         Default Value: 4000
 
         Examples:
+
           .. code-block:: python
 
+            import paddle
+            paddle.enable_static()
             import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.conv_workspace_size_limit = 1024
 
             optimizer = paddle.optimizer.SGD(learning_rate=0.01)
             optimizer = fleet.distributed_optimizer(optimizer, strategy)
-        
+
         """
         return self.strategy.conv_workspace_size_limit
 
@@ -1217,8 +1258,11 @@ class DistributedStrategy(object):
         Default Value: True
 
         Examples:
+
           .. code-block:: python
 
+            import paddle
+            paddle.enable_static()
             import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.cudnn_batchnorm_spatial_persistent = True
diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py
index 4db7f70e3cf5c07557259b9286866925c8e631f2..c5be6a7a8bb142ef4fe25a906785560f6b22e93c 100644
--- a/python/paddle/distributed/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
@@ -69,8 +69,11 @@ class Fleet(object):
         Fleet: A Fleet instance
 
     Example for collective training:
+
         .. code-block:: python
 
+            import paddle
+            paddle.enable_static()
             import paddle.distributed.fleet as fleet
 
             fleet.init(is_collective=True)
@@ -86,6 +89,8 @@ class Fleet(object):
 
         .. code-block:: python
 
+            import paddle
+            paddle.enable_static()
             import paddle.distributed.fleet as fleet
 
             fleet.init()
@@ -159,7 +164,7 @@ class Fleet(object):
             .. code-block:: python
 
                 import paddle.distributed.fleet as fleet
-                role = fleet.PaddleCloudRoleMaker
+                role = fleet.PaddleCloudRoleMaker()
                 fleet.init(role)
 
         """
@@ -233,6 +238,7 @@ class Fleet(object):
         Examples:
 
             .. code-block:: python
+
                 import paddle.distributed.fleet as fleet
                 fleet.init()
                 fleet.worker_index()
@@ -246,8 +252,9 @@ class Fleet(object):
 
         Returns:
             int: worker numbers
-        
+
         Examples:
+
             .. code-block:: python
 
                 import paddle.distributed.fleet as fleet
@@ -266,6 +273,7 @@ class Fleet(object):
                   False if not.
 
         Examples:
+
             .. code-block:: python
 
                 import paddle.distributed.fleet as fleet
@@ -283,6 +291,7 @@ class Fleet(object):
             list/string: server endpoints
 
         Examples:
+
             .. code-block:: python
 
                 import paddle.distributed.fleet as fleet
@@ -303,10 +312,12 @@ class Fleet(object):
             int: server number
 
         Examples:
+
             .. code-block:: python
-            import paddle.distributed.fleet as fleet
-            fleet.init()
-            fleet.server_num()
+
+                import paddle.distributed.fleet as fleet
+                fleet.init()
+                fleet.server_num()
         """
         return len(self._role_maker._get_pserver_endpoints())
 
@@ -318,6 +329,7 @@ class Fleet(object):
             int: node id
 
         Examples:
+
             .. code-block:: python
 
                 import paddle.distributed.fleet as fleet
@@ -335,6 +347,7 @@ class Fleet(object):
             list/string: server endpoints
 
         Examples:
+
             .. code-block:: python
 
                 import paddle.distributed.fleet as fleet
@@ -359,6 +372,7 @@ class Fleet(object):
         Examples:
 
             .. code-block:: python
+
                 import paddle.distributed.fleet as fleet
                 fleet.init()
                 fleet.is_server()
@@ -510,21 +524,21 @@ class Fleet(object):
     def save_persistables(self, executor, dirname, main_program=None, mode=1):
         """
 
-        saves all persistable variables from :code:`main_program` to
+        saves all persistable tensors from :code:`main_program` to
         the folder :code:`dirname`. You can refer to
 
-        The :code:`dirname` is used to specify the folder where persistable variables
-        are going to be saved. If you would like to save variables in separate
+        The :code:`dirname` is used to specify the folder where persistable tensors
+        are going to be saved. If you would like to save tensors in separate
         files, set :code:`filename` None.
 
         Args:
-            executor(Executor): The executor to run for saving persistable variables.
+            executor(Executor): The executor to run for saving persistable tensors.
                                 You can refer to :ref:`api_guide_executor_en` for
                                 more details.
 
             dirname(str, optional): The saving directory path.
                                 When you need to save the parameter to the memory, set it to None.
-            main_program(Program, optional): The program whose persistbale variables will
+            main_program(Program, optional): The program whose persistbale tensors will
                                              be saved. Default: None.
 
 
@@ -535,16 +549,17 @@ class Fleet(object):
 
             .. code-block:: text
 
+                import paddle
+                paddle.enable_static()
                 import paddle.distributed.fleet as fleet
-                import paddle.fluid as fluid
 
                 fleet.init()
 
                 # build net
                 # fleet.distributed_optimizer(...)
 
-                exe = fluid.Executor(fluid.CPUPlace())
-                fleet.save_persistables(exe, "dirname", fluid.default_main_program())
+                exe = paddle.static.Executor(paddle.CPUPlace())
+                fleet.save_persistables(exe, "dirname", paddle.static.default_main_program())
 
         """
 
@@ -569,9 +584,9 @@ class Fleet(object):
 
             .. code-block:: python
 
+                import paddle
                 import paddle.distributed.fleet as fleet
-                role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True)
-                fleet.init(role)
+                fleet.init(is_collective=True)
                 strategy = fleet.DistributedStrategy()
                 optimizer = paddle.optimizer.SGD(learning_rate=0.001)
                 optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
@@ -621,23 +636,20 @@ class Fleet(object):
                     def forward(self, x):
                         return self._linear2(self._linear1(x))
 
-                # 1. enable dynamic mode
-                paddle.disable_static()
-
-                # 2. initialize fleet environment
+                # 1. initialize fleet environment
                 fleet.init(is_collective=True)
 
-                # 3. create layer & optimizer
+                # 2. create layer & optimizer
                 layer = LinearNet()
                 loss_fn = nn.MSELoss()
                 adam = paddle.optimizer.Adam(
                     learning_rate=0.001, parameters=layer.parameters())
 
-                # 4. get data_parallel model using fleet
+                # 3. get data_parallel model using fleet
                 adam = fleet.distributed_optimizer(adam)
                 dp_layer = fleet.distributed_model(layer)
 
-                # 5. run layer
+                # 4. run layer
                 inputs = paddle.randn([10, 10], 'float32')
                 outputs = dp_layer(inputs)
                 labels = paddle.randn([10, 1], 'float32')
@@ -675,11 +687,10 @@ class Fleet(object):
                 import paddle
                 from paddle.distributed import fleet
 
-                paddle.disable_static()
                 fleet.init(is_collective=True)
 
                 value = np.arange(26).reshape(2, 13).astype("float32")
-                a = paddle.fluid.dygraph.to_variable(value)
+                a = paddle.to_tensor(value)
 
                 layer = paddle.nn.Linear(13, 5)
                 adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
@@ -710,11 +721,10 @@ class Fleet(object):
                 import paddle
                 from paddle.distributed import fleet
 
-                paddle.disable_static()
                 fleet.init(is_collective=True)
 
                 value = np.arange(26).reshape(2, 13).astype("float32")
-                a = paddle.fluid.dygraph.to_variable(value)
+                a = paddle.to_tensor(value)
 
                 layer = paddle.nn.Linear(13, 5)
                 adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
@@ -722,9 +732,9 @@ class Fleet(object):
                 adam = fleet.distributed_optimizer(adam)
                 dp_layer = fleet.distributed_model(layer)
                 state_dict = adam.state_dict()
-                paddle.framework.save(state_dict, "paddle_dy")
-                para_state_dict, opti_state_dict = paddle.framework.load( "paddle_dy")
-                adam.set_state_dict(opti_state_dict)
+                paddle.save(state_dict, "paddle_dy")
+                para_state_dict = paddle.load("paddle_dy")
+                adam.set_state_dict(para_state_dict)
         """
         # imitate target optimizer retrieval
         return self.user_defined_optimizer.set_state_dict(state_dict)
@@ -748,11 +758,10 @@ class Fleet(object):
                 import paddle
                 from paddle.distributed import fleet
 
-                paddle.disable_static()
                 fleet.init(is_collective=True)
 
                 value = np.arange(26).reshape(2, 13).astype("float32")
-                a = paddle.fluid.dygraph.to_variable(value)
+                a = paddle.to_tensor(value)
 
                 layer = paddle.nn.Linear(13, 5)
                 adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
@@ -785,17 +794,17 @@ class Fleet(object):
             float: The learning rate of the current step.
 
         Examples:
+
             .. code-block:: python
 
                 import numpy as np
                 import paddle
                 from paddle.distributed import fleet
 
-                paddle.disable_static()
                 fleet.init(is_collective=True)
 
                 value = np.arange(26).reshape(2, 13).astype("float32")
-                a = paddle.fluid.dygraph.to_variable(value)
+                a = paddle.to_tensor(value)
 
                 layer = paddle.nn.Linear(13, 5)
                 adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
@@ -819,6 +828,7 @@ class Fleet(object):
             None
 
         Examples:
+
             .. code-block:: python
 
                 import paddle
@@ -834,23 +844,20 @@ class Fleet(object):
                     def forward(self, x):
                         return self._linear2(self._linear1(x))
 
-                # 1. enable dynamic mode
-                paddle.disable_static()
-
-                # 2. initialize fleet environment
+                # 1. initialize fleet environment
                 fleet.init(is_collective=True)
 
-                # 3. create layer & optimizer
+                # 2. create layer & optimizer
                 layer = LinearNet()
                 loss_fn = nn.MSELoss()
                 adam = paddle.optimizer.Adam(
                     learning_rate=0.001, parameters=layer.parameters())
 
-                # 4. get data_parallel model using fleet
+                # 3. get data_parallel model using fleet
                 adam = fleet.distributed_optimizer(adam)
                 dp_layer = fleet.distributed_model(layer)
 
-                # 5. run layer
+                # 4. run layer
                 inputs = paddle.randn([10, 10], 'float32')
                 outputs = dp_layer(inputs)
                 labels = paddle.randn([10, 1], 'float32')
@@ -878,6 +885,7 @@ class Fleet(object):
             None
 
         Examples:
+
             .. code-block:: python
 
                 import paddle
@@ -893,23 +901,20 @@ class Fleet(object):
                     def forward(self, x):
                         return self._linear2(self._linear1(x))
 
-                # 1. enable dynamic mode
-                paddle.disable_static()
-
-                # 2. initialize fleet environment
+                # 1. initialize fleet environment
                 fleet.init(is_collective=True)
 
-                # 3. create layer & optimizer
+                # 2. create layer & optimizer
                 layer = LinearNet()
                 loss_fn = nn.MSELoss()
                 adam = paddle.optimizer.Adam(
                     learning_rate=0.001, parameters=layer.parameters())
 
-                # 4. get data_parallel model using fleet
+                # 3. get data_parallel model using fleet
                 adam = fleet.distributed_optimizer(adam)
                 dp_layer = fleet.distributed_model(layer)
 
-                # 5. run layer
+                # 4. run layer
                 inputs = paddle.randn([10, 10], 'float32')
                 outputs = dp_layer(inputs)
                 labels = paddle.randn([10, 1], 'float32')
@@ -962,38 +967,44 @@ class Fleet(object):
         Add distributed operations to minimize ``loss`` by updating ``parameter_list``.
 
         Args:
-            loss (Variable): A ``Variable`` containing the value to minimize.
+            loss (Tensor): A ``Tensor`` containing the value to minimize.
             startup_program (Program, optional): :ref:`api_fluid_Program` for
                 initializing parameters in ``parameter_list``. The default value
                 is None, at this time :ref:`api_fluid_default_startup_program` will be used.
-            parameter_list (Iterable, optional): Iterable of ``Variable`` or ``Variable.name`` to update
+            parameter_list (Iterable, optional): Iterable of ``Tensor`` or ``Tensor.name`` to update
                 to minimize ``loss``. The default value is None, at this time all parameters
                 will be updated.
-            no_grad_set (set, optional): Set of ``Variable``  or ``Variable.name`` that don't need
+            no_grad_set (set, optional): Set of ``Tensor``  or ``Tensor.name`` that don't need
                 to be updated. The default value is None.
 
         Returns:
             tuple: tuple (optimize_ops, params_grads), A list of operators appended
-            by minimize and a list of (param, grad) variable pairs, param is
+            by minimize and a list of (param, grad) tensor pairs, param is
             ``Parameter``, grad is the gradient value corresponding to the parameter.
             The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to
             indicate program pruning. If so, the program will be pruned by ``feed`` and
             ``fetch_list`` before run, see details in ``Executor``.
 
         Examples:
+
             .. code-block:: python
 
                 import paddle
+                paddle.enable_static()
                 import paddle.distributed.fleet as fleet
+                import paddle.nn.functional as F
+
+                hid_dim = 10
+                label_dim = 2
+                input_x = paddle.static.data(name='x', shape=[None, 13], dtype='float32')
+                input_y = paddle.static.data(name='y', shape=[None, 1], dtype='int64')
+                fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim, activation='tanh')
+                fc_2 = paddle.static.nn.fc(x=fc_1, size=hid_dim, activation='tanh')
+                prediction = paddle.static.nn.fc(x=[fc_2], size=label_dim, activation='softmax')
+                cost = F.cross_entropy(input=prediction, label=input_y)
+                avg_cost = paddle.mean(x=cost)
 
-                fc_1 = paddle.fluid.layers.fc(input=input_x, size=hid_dim, act='tanh')
-                fc_2 = paddle.fluid.layers.fc(input=fc_1, size=hid_dim, act='tanh')
-                prediction = paddle.fluid.layers.fc(input=[fc_2], size=label_dim, act='softmax')
-                cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
-                avg_cost = paddle.fluid.layers.mean(x=cost)
-
-                role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True)
-                fleet.init(role)
+                fleet.init(is_collective=True)
                 strategy = fleet.DistributedStrategy()
                 optimizer = paddle.optimizer.SGD(learning_rate=0.001)
                 optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)