diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h
index 5b8b70c5641672f3904f657c9a087dc3156ee525..601ae4f8c6de11b0bf25d4f9a92ef8eada67be3d 100644
--- a/paddle/fluid/framework/details/computation_op_handle.h
+++ b/paddle/fluid/framework/details/computation_op_handle.h
@@ -17,7 +17,6 @@
 #include <string>
 #include <vector>
 
-#include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 5b82805ad9391d82fd9b6cf020658cb0a2801c1d..2ab7da2d57c7a55ec496390b7364c55b56e8d65e 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -134,6 +134,7 @@ static const char kParams[] = "params";
 static const char kLocalScopes[] = "local_scopes";
 static const char kStrategy[] = "strategy";
 static const char kNumTrainers[] = "num_trainers";
+static const char kNumLossScaled[] = "num_loss_scaled";
 
 void MultiDevSSAGraphBuilder::Init() const {
   all_vars_.clear();
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index edb7b5e70ac0a082fac9906bb2fa6bc2064ffde9..f43207908762de1f0ab1e2a55ce06fadb1a67e63 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -41,10 +41,12 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
       Scope &local_scope = scope->NewScope();
       *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
           &local_scope;
+
       for (auto &info : var_infos_) {
         if (scope->FindVar(info.name_) != nullptr) {
           continue;
         }
+
         if (info.persistable_) {  // Persistable
           InitializeVariable(scope->Var(info.name_), info.type_);
         } else {
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index b45afbc0461d212095f422a71e0eee27572a2f39..24da56c09e3e0f3894d58e5af8838c98e3e1e67c 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -24,7 +24,6 @@
 #include
 #include "ThreadPool.h"  // ThreadPool in thrird party
 #include "paddle/fluid/framework/blocking_queue.h"
-#include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/exception_holder.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
diff --git a/paddle/fluid/framework/details/var_handle.cc b/paddle/fluid/framework/details/var_handle.cc
index 7de6025a28a1992786b73d53d456984e0cf418c5..30da029ca2a90e7faa6288557ff2f1aeb21cc1c6 100644
--- a/paddle/fluid/framework/details/var_handle.cc
+++ b/paddle/fluid/framework/details/var_handle.cc
@@ -20,7 +20,7 @@ namespace details {
 
 VarHandleBase::~VarHandleBase() {}
 
-VarHandle::~VarHandle() { VLOG(5) << "deleting var handle " << DebugString(); }
+VarHandle::~VarHandle() { VLOG(4) << "deleting var handle " << DebugString(); }
 
 std::string VarHandle::DebugString() const {
   std::stringstream ss;
diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index 10ae3a1c74842ca02002d40dac1c1f54627479c6..d2a393b3f19e9aab79098757dae663d030b0fa2b 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -49,6 +49,7 @@ class Node {
  public:
   virtual ~Node() {
     if (!wrapper_.empty()) {
+      VLOG(4) << "ir::Node deleting a wrapper node " << Name();
       wrapper_deleter_();
     }
   }
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
index eff76ce0d49df52b0219ba920103a3252e6cc026..18d95c94ad36316b7149eb5412260b40a57ac002 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
@@ -17,8 +17,6 @@ import unittest
 import logging
 import six
 
-ExecutorType = fluid.ExecutionStrategy().ExecutorType
-
 
 class TestBase(unittest.TestCase):
     def main(self,
@@ -26,7 +24,7 @@
              iter=10,
              iter_per_pe=10,
              use_gpu=True,
-             exec_type=ExecutorType.Default):
+             use_experimental_executor=False):
         if use_gpu and not fluid.core.is_compiled_with_cuda():
             logging.warning(
                 "Paddle is not compiled with CUDA, skip GPU unittests")
@@ -45,7 +43,7 @@ class TestBase(unittest.TestCase):
         for _ in six.moves.xrange(iter):
             exe_strategy = fluid.ExecutionStrategy()
             exe_strategy._dry_run = True
-            exe_strategy.executor_type = exec_type
+            exe_strategy.use_experimental_executor = use_experimental_executor
             pe = fluid.ParallelExecutor(
                 use_cuda=use_gpu,
                 loss_name=loss.name,
@@ -58,11 +56,11 @@ class TestBase(unittest.TestCase):
 
 class TestMNISTDryRun(TestBase):
     def test_mnist_dry_run(self):
         for use_gpu in (False, True):
-            for exec_type in (ExecutorType.Default, ExecutorType.Experimental):
+            for use_experimental_executor in (False, True):
                 self.main(
                     network_func=TestMNISTDryRun.network_func,
                     use_gpu=use_gpu,
-                    exec_type=exec_type)
+                    use_experimental_executor=use_experimental_executor)
 
     @staticmethod
     def network_func():
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
index c8ac6a90c1b795c0f353c77b28ad3abf712a4b2b..7d2349fad4c84923589708f4c70848f535fd61db 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -79,26 +79,25 @@ class TestMNIST(TestParallelExecutorBase):
             return
 
         img, label = self._init_data()
-        """
+
         all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
             model,
             feed_dict={"image": img,
                        "label": label},
             use_cuda=use_cuda,
             use_reduce=False)
-        """
+
         reduce_first_loss, reduce_last_loss = self.check_network_convergence(
             model,
             feed_dict={"image": img,
                        "label": label},
             use_cuda=use_cuda,
             use_reduce=True)
-        """
+
         for loss in zip(all_reduce_first_loss, reduce_first_loss):
             self.assertAlmostEqual(loss[0], loss[1], delta=1e-6)
         for loss in zip(all_reduce_last_loss, reduce_last_loss):
             self.assertAlmostEqual(loss[0], loss[1], delta=1e-4)
-        """
 
     # simple_fc
     def check_simple_fc_convergence(self,
@@ -118,7 +117,7 @@ class TestMNIST(TestParallelExecutorBase):
             use_reduce=use_reduce,
             use_parallel_graph=use_parallel_graph)
 
-    def notest_simple_fc(self):
+    def test_simple_fc(self):
         # use_cuda
         if core.is_compiled_with_cuda():
             self.check_simple_fc_convergence(True)
@@ -126,7 +125,7 @@ class TestMNIST(TestParallelExecutorBase):
                 True, use_reduce=False, use_parallel_graph=True)
         self.check_simple_fc_convergence(False)
 
-    def notest_simple_fc_with_new_strategy(self):
+    def test_simple_fc_with_new_strategy(self):
         # use_cuda, use_reduce
         self._compare_reduce_and_allreduce(simple_fc_net, True)
         self._compare_reduce_and_allreduce(simple_fc_net, False)
@@ -163,7 +162,7 @@ class TestMNIST(TestParallelExecutorBase):
         self.assertAlmostEquals(
             np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
 
-    def notest_simple_fc_parallel_accuracy(self):
+    def test_simple_fc_parallel_accuracy(self):
         if core.is_compiled_with_cuda():
             self.check_simple_fc_parallel_accuracy(True)
             self.check_simple_fc_parallel_accuracy(
@@ -192,7 +191,9 @@ class TestMNIST(TestParallelExecutorBase):
         for use_cuda in (False, True):
             for use_fast_executor in (False, True):
                 self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor)
-        self.check_batchnorm_fc_convergence(use_cuda, False, True)
+
+        self.check_batchnorm_fc_convergence(
+            use_cuda=True, use_fast_executor=False, use_parallel_graph=True)
 
     def test_batchnorm_fc_with_new_strategy(self):
         # FIXME(zcd): close this test temporally.