diff --git a/doc/design/images/parallel_executor_overview.dot b/doc/fluid/design/concepts/images/parallel_executor_overview.dot
similarity index 100%
rename from doc/design/images/parallel_executor_overview.dot
rename to doc/fluid/design/concepts/images/parallel_executor_overview.dot
diff --git a/doc/design/images/parallel_executor_overview.png b/doc/fluid/design/concepts/images/parallel_executor_overview.png
similarity index 100%
rename from doc/design/images/parallel_executor_overview.png
rename to doc/fluid/design/concepts/images/parallel_executor_overview.png
diff --git a/doc/fluid/design/concepts/index_cn.rst b/doc/fluid/design/concepts/index_cn.rst
index eec8a2f14ca9e8b3bf0d0acbbb6004972790d795..dcdc894937ff328e6002623275ca3c65e87b2bb0 100644
--- a/doc/fluid/design/concepts/index_cn.rst
+++ b/doc/fluid/design/concepts/index_cn.rst
@@ -16,3 +16,4 @@
    block.md
    scope.md
    executor.md
+   parallel_executor.md
diff --git a/doc/fluid/design/concepts/index_en.rst b/doc/fluid/design/concepts/index_en.rst
index 036e1da2550cf520f5c40ecd9657f71603755adc..b85a3055746facaa642e8fc899976b58435f1ef2 100644
--- a/doc/fluid/design/concepts/index_en.rst
+++ b/doc/fluid/design/concepts/index_en.rst
@@ -16,3 +16,4 @@ Core Concepts
    block.md
    scope.md
    executor.md
+   parallel_executor.md
diff --git a/doc/design/parallel_executor.md b/doc/fluid/design/concepts/parallel_executor.md
similarity index 100%
rename from doc/design/parallel_executor.md
rename to doc/fluid/design/concepts/parallel_executor.md
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 7fbe4efc045b6539b498389af94769e5bdb1f82e..c4efbcd3f977ee285e13223d7e0ca420aec63b98 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include
+#include
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
diff --git a/paddle/fluid/operators/adagrad_op.cc b/paddle/fluid/operators/adagrad_op.cc
index c990fe784380bf78a7f3594c0f49ef5e06e6caea..0153e1253b00ded21a7a14e37faf5a76d904d8d1 100644
--- a/paddle/fluid/operators/adagrad_op.cc
+++ b/paddle/fluid/operators/adagrad_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/adagrad_op.h"
+#include
 #include
diff --git a/paddle/fluid/operators/array_operator.h b/paddle/fluid/operators/array_operator.h
index dbcc7abb0996268b5a3571ba113d9cc56f6f65a3..4309f0a5497456065e5c43bc8f7b265fa711f699 100644
--- a/paddle/fluid/operators/array_operator.h
+++ b/paddle/fluid/operators/array_operator.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/operators/assign_value_op.cc b/paddle/fluid/operators/assign_value_op.cc
index e8123cb1a490be642d1061bba8129f63e681d3c3..993610fdedde4bafd99f59a0adeeeef4526eb089 100644
--- a/paddle/fluid/operators/assign_value_op.cc
+++ b/paddle/fluid/operators/assign_value_op.cc
@@ -13,6 +13,8 @@
 // limitations under the License.
 #include "paddle/fluid/operators/assign_value_op.h"
+#include
+#include
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/assign_value_op.h b/paddle/fluid/operators/assign_value_op.h
index c7b1a55a5cd52bd2bacbdea3ee22c75c2a2c12d5..e749d6f6d3685f207f0ad4f2ebc7c3c7ae32992c 100644
--- a/paddle/fluid/operators/assign_value_op.h
+++ b/paddle/fluid/operators/assign_value_op.h
@@ -14,6 +14,7 @@
 #pragma once
+#include
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/operators/auc_op.cc b/paddle/fluid/operators/auc_op.cc
index 71de78b1181daf4bd0b6d73508638857bafcf560..a168eaeab56128b75bbe97d7ccf843a081b5dced 100644
--- a/paddle/fluid/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/auc_op.h"
+#include
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/auc_op.h b/paddle/fluid/operators/auc_op.h
index f4e8208c3f2e238a4acecab4579fc955092d5978..8b016c3d31ad83e66baeb298c61840cc529efa1e 100644
--- a/paddle/fluid/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include
+#include
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -40,7 +42,7 @@ class AucKernel : public framework::OpKernel {
     std::vector thresholds_list;
     thresholds_list.reserve(num_thresholds);
     for (int i = 1; i < num_thresholds - 1; i++) {
-      thresholds_list[i] = (float)i / (num_thresholds - 1);
+      thresholds_list[i] = static_cast<float>(i) / (num_thresholds - 1);
     }
     const float kEpsilon = 1e-7;
     thresholds_list[0] = 0.0f - kEpsilon;
@@ -105,11 +107,12 @@ class AucKernel : public framework::OpKernel {
     float* fp_rate_data = fp_rate.mutable_data(ctx.GetPlace());
     float* rec_rate_data = rec_rate.mutable_data(ctx.GetPlace());
     for (int i = 0; i < num_thresholds; i++) {
-      tp_rate_data[i] =
-          ((float)tp_data[i] + epsilon) / (tp_data[i] + fn_data[i] + epsilon);
-      fp_rate_data[i] = (float)fp_data[i] / (fp_data[i] + tn_data[i] + epsilon);
-      rec_rate_data[i] =
-          ((float)tp_data[i] + epsilon) / (tp_data[i] + fp_data[i] + epsilon);
+      tp_rate_data[i] = (static_cast<float>(tp_data[i]) + epsilon) /
+                        (tp_data[i] + fn_data[i] + epsilon);
+      fp_rate_data[i] =
+          static_cast<float>(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon);
+      rec_rate_data[i] = (static_cast<float>(tp_data[i]) + epsilon) /
+                         (tp_data[i] + fp_data[i] + epsilon);
     }
     *auc_data = 0.0f;
     if (curve == "ROC") {
diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc
index c95077fcbdb6b6c0da31f30b795dbe4d7d4fe6fe..b21deaf9258567c05a8816b14ac7d6462964e8ba 100644
--- a/paddle/fluid/operators/average_accumulates_op.cc
+++ b/paddle/fluid/operators/average_accumulates_op.cc
@@ -19,15 +19,15 @@ namespace operators {
 template <>
 void GetAccumulators(
-    const framework::ExecutionContext& ctx, int64_t& num_updates_,
-    int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
+    const framework::ExecutionContext& ctx, int64_t* num_updates_,
+    int64_t* num_accumulates_, int64_t* old_num_accumulates_) {
   auto* in_old_num_accumulates = ctx.Input("in_old_num_accumulates");
   auto* in_num_accumulates = ctx.Input("in_num_accumulates");
   auto* in_num_updates = ctx.Input("in_num_updates");
-  old_num_accumulates_ = in_old_num_accumulates->data()[0];
-  num_accumulates_ = in_num_accumulates->data()[0];
-  num_updates_ = in_num_updates->data()[0];
+  *old_num_accumulates_ = in_old_num_accumulates->data()[0];
+  *num_accumulates_ = in_num_accumulates->data()[0];
+  *num_updates_ = in_num_updates->data()[0];
 }
 template <>
diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu
index 270c46984465e5ca62eaa8da3955ce7a3eaa0c57..046f72b471fa7ffcc82d84262a668c90a7f577a8 100644
--- a/paddle/fluid/operators/average_accumulates_op.cu
+++ b/paddle/fluid/operators/average_accumulates_op.cu
@@ -19,18 +19,18 @@ namespace paddle {
 namespace operators {
 template <>
 void GetAccumulators(
-    const framework::ExecutionContext& ctx, int64_t& num_updates_,
-    int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
+    const framework::ExecutionContext& ctx, int64_t* num_updates_,
+    int64_t* num_accumulates_, int64_t* old_num_accumulates_) {
   auto* in_old_num_accumulates = ctx.Input("in_old_num_accumulates");
   auto* in_num_accumulates = ctx.Input("in_num_accumulates");
   auto* in_num_updates = ctx.Input("in_num_updates");
   auto stream = ctx.cuda_device_context().stream();
-  memory::Copy(platform::CPUPlace(), &old_num_accumulates_,
+  memory::Copy(platform::CPUPlace(), old_num_accumulates_,
                platform::CUDAPlace(), in_old_num_accumulates->data(),
                sizeof(int64_t), stream);
-  memory::Copy(platform::CPUPlace(), &num_accumulates_, platform::CUDAPlace(),
+  memory::Copy(platform::CPUPlace(), num_accumulates_, platform::CUDAPlace(),
                in_num_accumulates->data(), sizeof(int64_t), stream);
-  memory::Copy(platform::CPUPlace(), &num_updates_, platform::CUDAPlace(),
+  memory::Copy(platform::CPUPlace(), num_updates_, platform::CUDAPlace(),
                in_num_updates->data(), sizeof(int64_t), stream);
 }
diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h
index f858109d1428dc67d94c253e5a39818eb2d4560d..07ac5ced11605f6d0d5164d1c0f69acbd7bbed60 100644
--- a/paddle/fluid/operators/average_accumulates_op.h
+++ b/paddle/fluid/operators/average_accumulates_op.h
@@ -29,8 +29,8 @@ using EigenVector = framework::EigenVector;
 template
 void GetAccumulators(const framework::ExecutionContext& ctx,
-                     int64_t& num_updates, int64_t& num_accumulates,
-                     int64_t& old_num_accumulates);
+                     int64_t* num_updates, int64_t* num_accumulates,
+                     int64_t* old_num_accumulates);
 template
 void SetAccumulators(const framework::ExecutionContext& ctx,
@@ -47,8 +47,8 @@ class AverageAccumulatesKernel : public framework::OpKernel {
     int64_t num_updates = 0;
     int64_t num_accumulates = 0;
     int64_t old_num_accumulates = 0;
-    GetAccumulators(ctx, num_updates, num_accumulates,
-                    old_num_accumulates);
+    GetAccumulators(ctx, &num_updates, &num_accumulates,
+                    &old_num_accumulates);
     // Get attrs
     float average_window = ctx.Attr("average_window");
diff --git a/paddle/fluid/operators/spp_op.cc b/paddle/fluid/operators/spp_op.cc
index f1c4415f27d54ad09e5cb3659bd16abd82e38215..8c55b4ebbc88f696e99b1194055bed3b0d0b3f0b 100644
--- a/paddle/fluid/operators/spp_op.cc
+++ b/paddle/fluid/operators/spp_op.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/spp_op.h"
+#include
+#include
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h
index 3d2f22632570fe2a28a822370a400390c78b533a..08cb7849d20443862b66ea6096c095b294c7242c 100644
--- a/paddle/fluid/operators/spp_op.h
+++ b/paddle/fluid/operators/spp_op.h
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include
+#include
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/pooling.h"
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index d3d5c8a3429e2070c5472355b4440401eaa699cb..9061e137bd1c789d34665729c48c1c2ea9525c8e 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -10,6 +10,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/sum_op.h"
+#include
+#include
 #include
 #include "paddle/fluid/framework/var_type_inference.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h
index e7e5346cdca5efaf81c2b0fddedde7406e3b874d..49a4afb3a8a19c97e844e66477c6288772ece807 100644
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h
index 42828b7e6564d7da91d608d63fbc0615ef6c4f97..9f8482adedb4c29e32d4109941a2752d942ae49f 100644
--- a/paddle/fluid/operators/top_k_op.h
+++ b/paddle/fluid/operators/top_k_op.h
@@ -15,6 +15,8 @@ limitations under the License. */
 #pragma once
 #include
 #include
+#include
+#include
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc
index 87b1f530e08df7022d112b26e28511a982052126..4aea9cd65bed615c84c95d891a0a4092678e1444 100644
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/transpose_op.h"
+#include
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h
index 90f16499a6f52514bfed3dbeb4176ccc956b23d7..895d1ce2cca19c0c1e4aa03cc64eb1425e8bab1a 100644
--- a/paddle/fluid/operators/transpose_op.h
+++ b/paddle/fluid/operators/transpose_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
+#include
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc
index 0ca7ea00fafc5cf7ab240e1e41710d3b791dfbfb..31859fd1d70dc6e6387258cd5f7412e78a302567 100644
--- a/paddle/fluid/operators/unpool_op.cc
+++ b/paddle/fluid/operators/unpool_op.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/unpool_op.h"
+#include
+#include
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/unpool_op.h b/paddle/fluid/operators/unpool_op.h
index a4421045756bd39728fc14c06efd11a56c7e55af..96abad3de9b959ee611355c67f1fa9e56c430b1b 100644
--- a/paddle/fluid/operators/unpool_op.h
+++ b/paddle/fluid/operators/unpool_op.h
@@ -14,6 +14,8 @@ limitations under the License. */
 #pragma once
+#include
+#include
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/unpooling.h"
diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h
index 3e3e3089315ab9365925c38b9bce5fb0120d37c3..afbfe69973830bde93ec0af8d1c844580a786663 100644
--- a/paddle/fluid/operators/warpctc_op.h
+++ b/paddle/fluid/operators/warpctc_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
+#include
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/sequence_padding.h"
diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py
index 7a2a81be9f269f262160cd082ec3a1d8e8e46811..3c6be913200716ae4f70e2b48ee8faf8078223d2 100644
--- a/python/paddle/fluid/distribute_transpiler.py
+++ b/python/paddle/fluid/distribute_transpiler.py
@@ -102,6 +102,8 @@ def split_dense_variable(var_list,
         the parameter server side can gain better performance. By default
         minimum block size is 1024. The max block size is used to prevent
         very large blocks that may cause send error.
+    :return: A list of VarBlocks. Each VarBlock specifies a shard of
+        the var.
     """
     blocks = []
     for var in var_list:
@@ -192,22 +194,24 @@ class DistributeTranspiler:
         self.trainer_id = trainer_id
         pserver_endpoints = pservers.split(",")
-        # step1
+        # step1: For large parameters and gradients, split them into smaller
+        # blocks.
         param_list = [pg[0] for pg in params_grads]
         grad_list = [pg[1] for pg in params_grads]
         grad_blocks = split_dense_variable(grad_list, len(pserver_endpoints))
         param_blocks = split_dense_variable(param_list, len(pserver_endpoints))
-        # step2
+        # step2: Create new vars for the parameter and gradient blocks and
+        # add ops to do the split.
         grad_var_mapping = self._append_split_op(program, grad_blocks)
-        # step3
+        param_var_mapping = self._create_vars_from_blocklist(program,
+                                                             param_blocks)
+        # step3: Add gradients as send op inputs and parameters as send
+        # op outputs.
         send_inputs = []
         send_outputs = []
         for b in grad_blocks:  # append by order
             varname, block_id, _ = b.split(":")
             send_inputs.append(grad_var_mapping[varname][int(block_id)])
-
-        param_var_mapping = self._create_vars_from_blocklist(program,
-                                                             param_blocks)
         for b in param_blocks:
             varname, block_id, _ = b.split(":")
             send_outputs.append(param_var_mapping[varname][int(block_id)])
@@ -237,7 +241,7 @@ class DistributeTranspiler:
                    "RPCClient": rpc_client_var},
             attrs={"endpoints": pserver_endpoints,
                    "epmap": eplist})
-        # step4
+        # step4: Concat the parameter splits together after recv.
         for varname, splited_var in param_var_mapping.iteritems():
             if len(splited_var) <= 1:
                 continue
@@ -258,13 +262,14 @@ class DistributeTranspiler:
     def get_pserver_program(self, endpoint):
         """
         Get pserver side program using the endpoint.
+        TODO(panyx0718): Revisit this assumption. What if #blocks > #pservers?
         NOTE: assume blocks of the same variable is not distributed on the
         same pserver, only change param/grad varnames for trainers to fetch.
         """
         # step1
         pserver_program = Program()
-        # step2
+        # step2: Create vars to receive vars at parameter servers.
         recv_inputs = []
         for v in self.param_grad_ep_mapping[endpoint]["params"]:
             self._clone_var(pserver_program.global_block(), v)
@@ -278,12 +283,6 @@ class DistributeTranspiler:
                 orig_var_name = v.name[:suff_idx]
             else:
                 orig_var_name = v.name
-            single_trainer_var = pserver_program.global_block().create_var(
-                name=orig_var_name,
-                persistable=True,
-                type=v.type,
-                dtype=v.dtype,
-                shape=v.shape)
             if self.trainers > 1:
                 for trainer_id in xrange(self.trainers):
                     var = pserver_program.global_block().create_var(
@@ -294,6 +293,12 @@ class DistributeTranspiler:
                         shape=v.shape)
                     recv_inputs.append(var)
             else:
+                single_trainer_var = pserver_program.global_block().create_var(
+                    name=orig_var_name,
+                    persistable=True,
+                    type=v.type,
+                    dtype=v.dtype,
+                    shape=v.shape)
                 recv_inputs.append(single_trainer_var)
 
         # step3
@@ -344,7 +349,7 @@ class DistributeTranspiler:
                 self._append_pserver_non_opt_ops(block, op)
 
         append_block = optimize_block
-        # append lr decay ops to the child block if exits
+        # append lr decay ops to the child block if they exist
        lr_ops = self._get_lr_ops()
         if len(lr_ops) > 0:
             for _, op in enumerate(lr_ops):
@@ -447,8 +452,10 @@ class DistributeTranspiler:
                                      block_list,
                                      add_trainer_suffix=False):
         """
+        Create vars for each split.
         NOTE: only grads need to be named for different trainers, use
               add_trainer_suffix to rename the grad vars.
+        :return: A dict mapping from original var name to each var split.
         """
         block_map = dict()
         var_mapping = dict()
@@ -615,6 +622,7 @@ class DistributeTranspiler:
                 type="sum",
                 inputs={"X": vars2merge},
                 outputs={"Out": merged_var})
+            # TODO(panyx0718): What if it's SELECTED_ROWS?
             if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS:
                 optimize_block.append_op(
                     type="scale",
@@ -638,7 +646,7 @@ class DistributeTranspiler:
                     shape=param_block.shape)
                 new_inputs[key] = tmpvar
             elif key == "LearningRate":
-                # leraning rate variable has already be created by non-optimize op,
+                # learning rate variable has already been created by non-optimize op,
                 # don't create it once again.
                 lr_varname = opt_op.input(key)[0]
                 if pserver_block.vars.has_key(lr_varname):
@@ -773,6 +781,7 @@ class DistributeTranspiler:
         return False
 
     def _get_input_map_from_op(self, varmap, op):
+        """Returns a dict from op input name to the vars in varmap."""
         iomap = dict()
         for key in op.input_names:
             vars = []
@@ -785,6 +794,7 @@ class DistributeTranspiler:
         return iomap
 
     def _get_output_map_from_op(self, varmap, op):
+        """Returns a dict from op output name to the vars in varmap."""
         iomap = dict()
         for key in op.output_names:
             vars = []
@@ -812,6 +822,7 @@ class DistributeTranspiler:
                 find_ops.append(op)
         # make a union find struct by the ops in default_main_program
         ufind = UnionFind(block.ops)
+
         for op1 in block.ops:
             for op2 in block.ops:
                 # NOTE: we need to skip all optimize ops, since it is connected