diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 450fe1508f2a505a233b3d300cb7c500894231e7..5c8776b62fe3a2640016412259721f812ff0579b 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -193,8 +193,7 @@ ParallelExecutor::ParallelExecutor(
     const std::unordered_set<std::string> &bcast_vars,
     const ProgramDesc &main_program, const std::string &loss_var_name,
     Scope *scope, const std::vector<Scope *> &local_scopes,
-    const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy,
-    size_t num_trainers, size_t trainer_id)
+    const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
   member_->use_cuda_ = exec_strategy.use_cuda_;
@@ -253,7 +252,8 @@ ParallelExecutor::ParallelExecutor(
     }

     member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
-        member_->places_, nccl_id, num_trainers, trainer_id));
+        member_->places_, nccl_id, build_strategy.num_trainers_,
+        build_strategy.trainer_id_));
 #else
     PADDLE_THROW("Not compiled with CUDA");
 #endif
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index 49d3f0d3f6f2a8965d39b656071d86bde42bfd93..121bbd55ad575477424a2fb12baab82585eae517 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -50,8 +50,7 @@ class ParallelExecutor {
                    const std::string &loss_var_name, Scope *scope,
                    const std::vector<Scope *> &local_scopes,
                    const ExecutionStrategy &exec_strategy,
-                   const BuildStrategy &build_strategy,
-                   size_t num_trainers = 1, size_t trainer_id = 0);
+                   const BuildStrategy &build_strategy);

   ~ParallelExecutor();

diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 3b81d59ad965b7532ca729682e7aeb8eb96194a8..2d817bcb0ddd6e91b6641075fd33ff36fdc3af94 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1022,8 +1022,7 @@ All parameter, weight, gradient are variables in Paddle.
   pe.def(py::init<const std::vector<platform::Place> &,
                   const std::unordered_set<std::string> &, const ProgramDesc &,
                   const std::string &, Scope *, std::vector<Scope *> &,
-                  const ExecutionStrategy &, const BuildStrategy &, size_t,
-                  size_t>())
+                  const ExecutionStrategy &, const BuildStrategy &>())
     // NOTE: even we return a vec<Scope*>* to Python use reference policy.
     // We still cannot get local_scope from this vector, since the element
     // of vec<Scope*> will be freed by Python GC. We can only return Scope*
diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..63331f5708a5f3b8ab72533ece38e9a75584fb2d
--- /dev/null
+++ b/python/paddle/fluid/compiler.py
@@ -0,0 +1,118 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import multiprocessing
+import os
+import six
+from .. import compat as cpt
+
+from . import core
+
+ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
+BuildStrategy = core.ParallelExecutor.BuildStrategy
+
+
+def _place_obj(place):
+    p = core.Place()
+    p.set_place(place)
+    return p
+
+
+class _ProgramCompiler(object):
+    def __init__(self, program):
+        self._program = program
+        self._compiled = False
+        self._is_data_parallel = False
+
+    def _with_data_parallel(self,
+                            loss_name=None,
+                            build_strategy=None,
+                            exec_strategy=None):
+        assert not self._is_data_parallel, "Already compiled with parallel."
+        self._is_data_parallel = True
+        self._build_strategy = build_strategy
+        self._exec_strategy = exec_strategy
+        self._loss_name = loss_name
+        return self
+
+    def _compile_data_parallel(self):
+        self._places = []
+        self._local_scopes = []
+
+        if self._exec_strategy is None:
+            self._exec_strategy = ExecutionStrategy()
+        if self._build_strategy is None:
+            self._build_strategy = BuildStrategy()
+
+        self._exec_strategy.use_cuda = isinstance(self._place, core.CUDAPlace)
+        if self._exec_strategy.use_cuda:
+            gpus_env = os.getenv("FLAGS_selected_gpus")
+            if gpus_env:
+                gpus = [int(s) for s in gpus_env.split(",")]
+            else:
+                gpus = [
+                    i for i in six.moves.range(core.get_cuda_device_count())
+                ]
+            self._places = [core.CUDAPlace(i) for i in gpus]
+        else:
+            cpu_num = int(
+                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+            self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
+        assert self._places, "no place for execution"
+
+        if self._exec_strategy.num_threads == 0:
+            if self._exec_strategy.use_cuda:
+                # Experiments on se-resnext shows that too many threads hurt
+                # performance. Worth tunning for other models in the future.
+                self._exec_strategy.num_threads = len(self._places) * 4
+            else:
+                cpu_num = int(
+                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+                self._exec_strategy.num_threads = cpu_num * 2
+
+        trainers_endpoints = self._program._trainers_endpoints
+        if self._build_strategy.num_trainers > 1 and trainers_endpoints:
+            assert self._build_strategy.num_trainers == len(
+                trainers_endpoints), "num_trainers == len(end_points)"
+            self._build_strategy.trainers_endpoints = trainers_endpoints
+
+        self._persistable_vars = set([
+            cpt.to_text(v.name)
+            for v in [
+                var for var in self._program.list_vars()
+                if var.persistable and var.type != core.VarDesc.VarType.RAW
+            ]
+        ])
+
+        places = list(map(_place_obj, self._places))
+        return core.ParallelExecutor(
+            places, self._persistable_vars, self._program.desc,
+            cpt.to_text(self._loss_name)
+            if self._loss_name else six.u(''), self._scope, self._local_scopes,
+            self._exec_strategy, self._build_strategy)
+
+    def _compile(self, scope, place):
+        if self._compiled:
+            return self
+        self._compiled = True
+
+        self._scope = scope
+        self._place = place
+
+        if self._is_data_parallel:
+            self._executor = self._compile_data_parallel()
+        else:
+            p = _place_obj(self._place)
+            self._executor = core.Executor(p)
+        return self
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 5a9e908b61eeeea3fdfdfcc54d1f150f59a3973b..ee7df740075dbebab6721d5c0849005c7e315572 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -14,11 +14,15 @@

 from __future__ import print_function

+import os
+import multiprocessing
 import numpy as np
 import contextlib
 import six
 from .framework import Program, default_main_program, Variable
 from . import core
+from . import compiler
+from .. import compat as cpt

 __all__ = ['Executor', 'global_scope', 'scope_guard']

@@ -275,11 +279,8 @@ class Executor(object):

     def __init__(self, place):
         self.place = place
-        p = core.Place()
-        p.set_place(place)
-        self.executor = core.Executor(p)
-
         self.program_caches = dict()
+        self.executor = None
         self._closed = False

     def _get_program_cache(self, program_cache_key):
@@ -361,6 +362,7 @@
         You can no long use this executor after calling this method.
         For the distributed training, this method would free the resource on PServers related to
         the current Trainer.
+        TODO(panyx0718): Why ParallelExecutor doesn't have close?

         Example:
             >>> cpu = core.CPUPlace()
@@ -368,10 +370,58 @@
             >>> ...
             >>> exe.close()
         """
-        if not self._closed:
+        if not self._closed and self.executor:
             self.executor.close()
         self._closed = True

+    def _run_parallel(self,
+                      exe,
+                      scope,
+                      feed=None,
+                      fetch_list=None,
+                      return_numpy=True):
+        if isinstance(feed, dict):
+            feed_tensor_dict = dict()
+            for feed_name in feed:
+                feed_tensor = feed[feed_name]
+                if not isinstance(feed_tensor, core.LoDTensor):
+                    feed_tensor = core.LoDTensor()
+                    # always set to CPU place, since the tensor need to be splitted
+                    # it is fast in CPU
+                    feed_tensor.set(feed[feed_name], core.CPUPlace())
+                feed_tensor_dict[feed_name] = feed_tensor
+
+            exe.feed_and_split_tensor_into_local_scopes(feed_tensor_dict)
+        elif isinstance(feed, list) or isinstance(feed, tuple):
+            if len(feed) != len(self._places):
+                raise ValueError(
+                    "Feed a list of tensor, the list should be the same size as places"
+                )
+
+            res = list()
+            for i, each in enumerate(feed):
+                if not isinstance(each, dict):
+                    raise TypeError(
+                        "Each element of feed list should be a dict")
+                res_dict = dict()
+                for feed_name in each:
+                    tensor = each[feed_name]
+                    if not isinstance(tensor, core.LoDTensor):
+                        tmp = core.LoDTensor()
+                        tmp.set(tensor, self._places[i])
+                        tensor = tmp
+                    res_dict[feed_name] = tensor
+                res.append(res_dict)
+            exe.feed_tensors_into_local_scopes(res)
+
+        fetch_var_name = '@FETCHED_VAR_NAME@'
+        exe.run(fetch_list, fetch_var_name)
+        arr = scope.find_var(fetch_var_name).get_lod_tensor_array()
+
+        if return_numpy:
+            return as_numpy(arr)
+        return [arr[i] for i in range(len(arr))]
+
     def run(self,
             program=None,
             feed=None,
@@ -428,6 +478,47 @@
         if self._closed:
             raise RuntimeError("Attempted to use a closed Executor")

+        if scope is None:
+            scope = global_scope()
+
+        compiled = isinstance(program, compiler._ProgramCompiler)
+        if not compiled:
+            p = core.Place()
+            p.set_place(self.place)
+            self.executor = core.Executor(p)
+            return self._run(
+                program,
+                feed=feed,
+                fetch_list=fetch_list,
+                feed_var_name=feed_var_name,
+                fetch_var_name=fetch_var_name,
+                scope=scope,
+                return_numpy=return_numpy,
+                use_program_cache=use_program_cache)
+
+        program._compile(scope, self.place)
+        self.executor = program._executor
+        if program._is_data_parallel:
+            return self._run_parallel(
+                exe=program._executor,
+                scope=scope,
+                feed=feed,
+                fetch_list=fetch_list,
+                return_numpy=return_numpy)
+        else:
+            return self._run(
+                program._program,
+                feed=feed,
+                fetch_list=fetch_list,
+                feed_var_name=feed_var_name,
+                fetch_var_name=fetch_var_name,
+                scope=scope,
+                return_numpy=return_numpy,
+                use_program_cache=use_program_cache)
+
+    def _run(self, program, feed, fetch_list, feed_var_name, fetch_var_name,
+             scope, return_numpy, use_program_cache):
+
         if feed is None:
             feed = {}
         if not isinstance(feed, dict):
@@ -444,9 +535,6 @@
                 "Executor requires Program as its Parameter. But you passed in %s"
                 % (type(program)))

-        if scope is None:
-            scope = global_scope()
-
         cache_key = _get_program_cache_key(feed, fetch_list)
         if use_program_cache:
             cached_program = self._get_program_cache(cache_key)
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index c97a93ec36d4f4a7ff6a9f097551e2d21022d5b1..917db02bb852df5c540279e5a5bbeaa7b701b09d 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -167,9 +167,8 @@
         # step7: init ParallelExecutor
         self.executor = core.ParallelExecutor(
             places, persistable_vars, main.desc,
-            cpt.to_text(loss_name)
-            if loss_name else six.u(''), scope, local_scopes, exec_strategy,
-            build_strategy, num_trainers, trainer_id)
+            cpt.to_text(loss_name) if loss_name else six.u(''), scope,
+            local_scopes, exec_strategy, build_strategy)

         self.scope = scope

@@ -292,3 +291,6 @@
     @property
     def device_count(self):
         return len(self._places)
+
+    def close(self):
+        pass
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index 2b0ab0cc3bc23bab140d2b7e8cb765e537ff3f5c..2038b57a6c20433a2d7c02eb21676cb6dce00eab 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -19,6 +19,7 @@ import os
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
+from paddle.fluid import compiler
 import time
 import numpy as np
 import math
@@ -44,15 +45,8 @@
                    optimizer=fluid.optimizer.Adam,
                    use_fast_executor=False,
                    enable_sequential_execution=False):
-        def run_executor(exe, feed, fetch_list, program=None):
-            if isinstance(exe, fluid.ParallelExecutor):
-                res = exe.run(fetch_list=fetch_list, feed=feed)
-            elif isinstance(exe, fluid.Executor):
-                if program is None:
-                    program = fluid.default_main_program()
-                res = exe.run(program=program, feed=feed, fetch_list=fetch_list)
-            else:
-                raise ValueError('Unkown type exe')
+        def run_executor(exe, binary, feed, fetch_list):
+            res = exe.run(binary, feed=feed, fetch_list=fetch_list)
             return res

         main = fluid.Program()
@@ -72,8 +66,8 @@
             fluid.memory_optimize(main)

         place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        startup_exe = fluid.Executor(place)
-        startup_exe.run(startup)
+        exe = fluid.Executor(place)
+        exe.run(startup)
         exec_strategy = fluid.ExecutionStrategy()
         exec_strategy.allow_op_delay = allow_op_delay
         if use_fast_executor:
@@ -86,15 +80,13 @@
         build_strategy.enable_sequential_execution = enable_sequential_execution
         if use_cuda and core.is_compiled_with_cuda():
             build_strategy.remove_unnecessary_lock = True
-
         if use_parallel_executor:
-            exe = fluid.ParallelExecutor(
-                use_cuda,
+            binary = compiler._ProgramCompiler(main)._with_data_parallel(
                 loss_name=loss.name,
-                exec_strategy=exec_strategy,
-                build_strategy=build_strategy)
+                build_strategy=build_strategy,
+                exec_strategy=exec_strategy)
         else:
-            exe = fluid.Executor(place=place)
+            binary = compiler._ProgramCompiler(main)

         if batch_size is not None:
             batch_size *= fluid.core.get_cuda_device_count(
@@ -102,13 +94,14 @@
                 os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
         begin = time.time()
         first_loss, = run_executor(
-            exe=exe, feed=feed_dict, fetch_list=[loss.name])
+            exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])

         for i in range(iter):
-            run_executor(exe=exe, feed=feed_dict, fetch_list=[])
+            run_executor(
+                exe=exe, binary=binary, feed=feed_dict, fetch_list=[])

         last_loss, = run_executor(
-            exe=exe, feed=feed_dict, fetch_list=[loss.name])
+            exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
         end = time.time()

         if batch_size is not None:
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 0caab08f0dc19efceadd723b474c10e1a2deb449..5cc5d9f3d3e98b9ab6117a563e70bff27e150c30 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -26,6 +26,7 @@ import pickle
 import numpy as np

 import paddle.fluid as fluid
+from paddle.fluid import compiler

 RUN_STEP = 10
 DEFAULT_BATCH_SIZE = 2
@@ -104,8 +105,8 @@
         else:
             place = fluid.CPUPlace()

-        startup_exe = fluid.Executor(place)
-        startup_exe.run(fluid.default_startup_program())
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())

         strategy = fluid.ExecutionStrategy()
         strategy.num_threads = 1
@@ -125,19 +126,16 @@
             mypass.set_int("num_repeats", args.batch_merge_repeat)

         if args.update_method == "nccl2":
-            num_trainers = len(args.endpoints.split(","))
-            trainer_id = args.trainer_id
+            build_stra.num_trainers = len(args.endpoints.split(","))
+            build_stra.trainer_id = args.trainer_id
         else:
-            num_trainers = 1
-            trainer_id = 0
+            build_stra.num_trainers = 1
+            build_stra.trainer_id = 0

-        exe = fluid.ParallelExecutor(
-            args.use_cuda,
+        binary = compiler._ProgramCompiler(trainer_prog)._with_data_parallel(
             loss_name=avg_cost.name,
-            exec_strategy=strategy,
             build_strategy=build_stra,
-            num_trainers=num_trainers,
-            trainer_id=trainer_id)
+            exec_strategy=strategy)

         feed_var_list = [
             var for var in trainer_prog.global_block().vars.values()
@@ -160,7 +158,8 @@

         out_losses = []
         for _ in six.moves.xrange(RUN_STEP):
-            loss, = exe.run(fetch_list=[avg_cost.name],
+            loss, = exe.run(binary,
+                            fetch_list=[avg_cost.name],
                             feed=feeder.feed(get_data()))
             out_losses.append(loss[0])
         if six.PY2:
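
A minimal usage sketch of the new compile-then-run flow, pieced together only from the call sites shown above in parallel_executor_test_base.py and executor.py; `main`, `startup`, `loss`, and `feed_dict` are hypothetical placeholders for a user-built Program, its startup Program, a loss Variable, and feed data:

```python
import paddle.fluid as fluid
from paddle.fluid import compiler

# Run the startup program with a plain Executor, as the updated test base does.
place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda() \
    else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup)  # `startup` is a placeholder startup Program

# Compile `main` for data-parallel execution instead of constructing
# fluid.ParallelExecutor directly; both strategies are optional and default to
# ExecutionStrategy()/BuildStrategy() inside _compile_data_parallel().
binary = compiler._ProgramCompiler(main)._with_data_parallel(
    loss_name=loss.name,
    build_strategy=fluid.BuildStrategy(),
    exec_strategy=fluid.ExecutionStrategy())

# Executor.run() now detects a _ProgramCompiler argument, compiles it against
# the current scope/place, and dispatches to _run_parallel() under the hood.
loss_value, = exe.run(binary, feed=feed_dict, fetch_list=[loss.name])
```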