diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 4253300788462a3704076fc79241a864f2f130a0..6fb17470a3b08ebfc4ac8cb0022cfba037747578 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/proto_desc.h"
+#include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/detail/grpc_server.h"
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
 #include "paddle/fluid/operators/detail/simple_block_queue.h"
@@ -89,6 +90,7 @@ class ListenAndServOp : public framework::OperatorBase {
 
     auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
     auto *program = block->Program();
+    int num_blocks = program->Size();
     framework::Executor executor(dev_place);
 
     // TODO(typhoonzero): change this to a while_op for every cluster-batch.
@@ -132,12 +134,33 @@ class ListenAndServOp : public framework::OperatorBase {
         rpc_service_->ShutDown();
         break;
       }
-      try {
-        executor.Run(*program, &recv_scope, block->ID(), /*global_block*/
-                     false /*create_local_scope*/, false /*create_vars*/);
-      } catch (std::exception &e) {
-        LOG(ERROR) << "run sub program error " << e.what();
+
+      // Run every block but the last in the thread pool; the last block should
+      // hold the global ops. NOTE: if is_gpu_place, CUDA kernels are launched
+      // by multiple threads and this will still work.
+      // Capture blkid by value: a task may start after the loop has advanced.
+      std::vector<std::future<void>> fs;
+      for (int blkid = 0; blkid < num_blocks - 1; ++blkid) {
+        fs.push_back(framework::Async([&, blkid]() {
+          try {
+            executor.Run(*program, &recv_scope, blkid,
+                         false /*create_local_scope*/, false /*create_vars*/);
+          } catch (std::exception &e) {
+            LOG(ERROR) << "run sub program error " << e.what();
+          }
+        }));
       }
+      for (auto &f : fs) f.wait();
+      // Run global block at final step
+      if (num_blocks > 2) {
+        try {
+          executor.Run(*program, &recv_scope, num_blocks - 1,
+                       false /*create_local_scope*/, false /*create_vars*/);
+        } catch (std::exception &e) {
+          LOG(ERROR) << "run sub program error " << e.what();
+        }
+      }
+
       // Reset the received sparse variables, the sum operator would not
       // sum the input sparse variables which rows is empty at the next
       // mini-batch.
diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py
index 3d3a6c116eeb39fb7236d0e9707415cdd6b828bd..cdde3512960686b16a0add2030ee79a8f7ba1488 100644
--- a/python/paddle/fluid/distribute_transpiler.py
+++ b/python/paddle/fluid/distribute_transpiler.py
@@ -307,15 +307,58 @@ class DistributeTranspiler:
         # Iterate through the ops, and if an op and the optimize ops
         # which located on current pserver are in one set, then 
         # append it into the sub program.
-        for _, op in enumerate(self.optimize_ops):
-            for _, opt_op in enumerate(opt_op_on_pserver):
-                if ufind.is_connected(op, opt_op):
-                    if self._is_opt_op(op):
-                        self._append_pserver_ops(optimize_block, op, endpoint,
-                                                 default_main_program())
-                    else:
-                        self._append_pserver_non_opt_ops(optimize_block, op)
-                    break
+
+        # We try to run the optimization program in parallel. Assume the
+        # optimization program always looks like:
+        #
+        # prevop -> prevop -> opt op -> following op -> following op; ->
+        # prevop -> prevop -> opt op -> following op -> following op; ->
+        # global op -> global op
+        #
+        # We put operators that can run in parallel into separate program
+        # blocks. In the example above, ops are separated at each ";". Global
+        # ops must run after all the optimize ops have finished.
+
+        # HACK: optimization global ops are only used to scale beta1 and
+        # beta2; replace this with a dependency engine.
+        global_ops = []
+        for op in self.optimize_ops:
+            if op.type != "scale":
+                continue
+            for in_name in op.input_arg_names:
+                if in_name.startswith(("beta1_pow_acc", "beta2_pow_acc")):
+                    global_ops.append(op)
+                    break  # append each op once even if several inputs match
+
+        def __append_optimize_op__(op, block):
+            if self._is_opt_op(op):
+                self._append_pserver_ops(block, op, endpoint,
+                                         default_main_program())
+            else:
+                self._append_pserver_non_opt_ops(block, op)
+
+        # append op to the current block
+        per_opt_block = optimize_block
+        for _, opt_op in enumerate(opt_op_on_pserver):
+            for _, op in enumerate(self.optimize_ops):
+                # optimizer is connected to itself
+                if ufind.is_connected(op, opt_op) and \
+                    op not in global_ops:
+                    __append_optimize_op__(op, per_opt_block)
+            per_opt_block = pserver_program.create_block(0)
+
+        # append global ops
+        for glb_op in global_ops:
+            __append_optimize_op__(glb_op, per_opt_block)
+
+        # NOT USED: single block version:
+        #
+        # for _, op in enumerate(self.optimize_ops):
+        #     for _, opt_op in enumerate(opt_op_on_pserver):
+        #         if ufind.is_connected(op, opt_op):
+        #             __append_optimize_op__(op, optimize_block)
+        #             break
+
         # step5 append the listen_and_serv op
         pserver_program.global_block().append_op(
             type="listen_and_serv",
@@ -660,10 +703,22 @@ class DistributeTranspiler:
         # If one op's input is another op's output or
         # one op's output is another op's input, we say
         # the two operator is connected.
-        op1_input_names = op1.desc.input_arg_names()
+        def _append_inname_remove_beta(varname_list):
+            op_input_names = []
+            for in_name in varname_list:
+                # HACK: skip beta1 and beta2 inputs so that they do not
+                # make all ops connected.
+                if in_name.startswith("beta2_pow_acc") or \
+                    in_name.startswith("beta1_pow_acc"):
+                    continue
+                else:
+                    op_input_names.append(in_name)
+            return op_input_names
+
+        op1_input_names = _append_inname_remove_beta(op1.desc.input_arg_names())
         op1_output_names = op1.desc.output_arg_names()
 
-        op2_input_names = op2.desc.input_arg_names()
+        op2_input_names = _append_inname_remove_beta(op2.desc.input_arg_names())
         op2_output_names = op2.desc.output_arg_names()
 
         if set(op1_output_names) & set(op2_input_names) or \