diff --git a/doc/design/parallel_executor.md b/doc/design/parallel_executor.md
new file mode 100644
index 0000000000000000000000000000000000000000..567eede1bd59bfb519bdb0b96de1684cfea6c61b
--- /dev/null
+++ b/doc/design/parallel_executor.md
@@ -0,0 +1,52 @@
+# ParallelExecutor Design Doc
+
+## Introduction
+
+We introduce `ParallelExecutor` to run multi-GPU training in PaddlePaddle Fluid. It supports
+1. keeping a copy of the parameters on each GPU
+1. allreduce on a separate stream, allowing computation and communication to overlap
+
+An example of switching from single-GPU training to multi-GPU training:
+```python
+cost = your_neural_network()
+opt = fluid.optimizer.SGDOptimizer(learning_rate=0.01)
+opt.minimize(cost)
+
+# change Executor -> ParallelExecutor
+exe = fluid.ParallelExecutor(gpu_list=[0, 1])
+
+for i in xrange(iter_num):
+    exe.run()
+```
+
+## Design
+
+The constructor takes the list of parameters whose gradients need to be allreduced.
+
+At runtime, `ParallelExecutor` starts `#gpu` threads, one `Executor` per GPU. For every
+operator run on each GPU, it automatically syncs with the other streams when necessary, as in the
+pseudocode below; a thread-level sketch of `Run` follows it.
+
+```c++
+// if op's input is params' grad:
+//   sync with allreduce stream
+//   e.g. sgd should wait for allreduce to be finished
+SyncMultipleStreams(op);
+
+op->Run(*local_scope, place_);
+
+// if op's output is params' grad:
+//   sync with computation stream
+//   e.g. allreduce should wait for fc_grad to be finished
+SyncMultipleStreams(op);
+```
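+
+A minimal sketch of how `ParallelExecutor::Run` could drive the per-GPU threads. It assumes one
+`Executor`/`Scope` pair per GPU, following the `parallel_executor.h` skeleton added by this patch,
+and omits the stream-sync callback wiring, so it is illustrative rather than final:
+
+```c++
+void ParallelExecutor::Run(const ProgramDesc& prog, Scope* scope, int block_id,
+                           bool create_local_scope, bool create_vars) {
+  std::vector<std::thread> workers;
+  for (size_t i = 0; i < exes_.size(); ++i) {
+    // Each GPU runs the whole program on its own Executor and Scope;
+    // gradients are exchanged at the sync points shown above.
+    workers.emplace_back([&, i] {
+      exes_[i].Run(prog, scopes_[i], block_id, create_local_scope, create_vars);
+    });
+  }
+  for (auto& worker : workers) {
+    worker.join();
+  }
+}
+```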
+
+## API
+
+`ParallelExecutor.run` has a similar interface to `Executor.run`, with the following differences:
+1. Scope: we don't expose `scope` in `ParallelExecutor.run`, since `ParallelExecutor` keeps its
+own scope to maintain the NCCL communicator.
+1. Feed: we don't expose `feed` in the API either, because the whole point of `ParallelExecutor`
+is speed; the input to the network should come from a reader OP.
+1. Fetch: we return the fetched values on all GPUs as a list (e.g. `exe.run(..., fetch=loss)`
+will return `[loss_on_gpu0, loss_on_gpu1]`).
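+
+For reference, the second sync point above (allreduce on a parameter's gradient) maps to the
+`AllReduceCallBack` hook declared in `paddle/fluid/framework/parallel_executor.h`. A rough sketch,
+leaving the actual NCCL launch and the stream plumbing out of this patch:
+
+```c++
+void AllReduceCallBack::operator()(framework::OperatorBase* op) {
+  for (auto& name : op->OutputVars(/*has_intermediate=*/false)) {
+    if (param_grad_names_.count(name) == 0) continue;
+    // The gradient was produced on the computation stream; wait for it
+    // before issuing the allreduce on the separate allreduce stream.
+    dev_ctx.Wait();
+    // Launch ncclAllReduce on this gradient's buffer here (omitted).
+  }
+}
+```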
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 15e5574ecfd406b87db8370948352b7e736937ea..934bb43ffea456d528dddf61dbc04f760246dfe6 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -86,6 +86,8 @@ cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glo
 
 cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table feed_fetch_method)
 
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS op_registry device_context scope
+            framework_proto backward glog lod_rank_table feed_fetch_method executor)
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 5cae38b2a857b2037f0e5ae4da50d1591da0c11a..6ee3f18dd42ef2d22c165907fdcf4c90a66425a4 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -305,10 +305,23 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   }  // if (create_vars)
 
   for (auto& op : ctx->ops_) {
+    // TODO(ty):
+    //   if op's input is params' grad:
+    //     sync with allreduce stream
+    //     e.g. sgd should wait for allreduce to be finished
+    // SyncMultipleStreams(op);
+
     VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
     op->Run(*local_scope, place_);
     VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
 
+    // TODO(ty):
+    //   if op's output is params' grad:
+    //     sync with computation stream
+    //     apply allreduce on allreduce stream
+    //     e.g. allreduce should wait for fc_grad to be finished
+    // SyncMultipleStreams(op);
+
     if (FLAGS_benchmark) {
       VLOG(2) << "Memory used after operator " + op->Type() + " running: "
               << memory::memory_usage(place_);
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index 28ce3315154cea45412984df4daf7385ce2cf572..8d8a7cf4db6907f98b0ad200c40400b1643766f3 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -47,6 +47,7 @@ class Executor {
                        const std::string& feed_holder_name = "feed",
                        const std::string& fetch_holder_name = "fetch");
 
+ private:
   static ExecutorPrepareContext* Prepare(const ProgramDesc& program,
                                          int block_id);
 
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e9f213ae2cff61ccdd0b87fd84ac1b68706f11b0
--- /dev/null
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/parallel_executor.h"
+
+namespace paddle {
+namespace framework {}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
new file mode 100644
index 0000000000000000000000000000000000000000..47e0005e58d916c15c87a2e9d3cf5f8e8c550102
--- /dev/null
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -0,0 +1,61 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/tensor.h"
+
+#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+
+struct AllReduceCallBack {
+  void operator()(framework::OperatorBase* op);
+
+  std::unordered_set<std::string> param_grad_names_;
+  platform::DeviceContext dev_ctx;
+};
+
+class ParallelExecutor {
+ public:
+  explicit ParallelExecutor(const std::vector<platform::Place>& places,
+                            const std::unordered_set<std::string>& params);
+
+  /* @Brief
+   * Runtime evaluation of the given ProgramDesc under certain Scope
+   *
+   * @param
+   *  ProgramDesc
+   *  Scope
+   */
+  void Run(const ProgramDesc& prog, Scope* scope, int block_id,
+           bool create_local_scope = true, bool create_vars = true);
+
+ private:
+  std::vector<Executor> exes_;
+  std::vector<Scope*> scopes_;
+  AllReduceCallBack all_reduce_callbacks_;
+  std::unordered_set<std::string> params_;  // where to initialize it?
+  platform::Communicator nccl_com_;
+};
+
+}  // namespace framework
+}  // namespace paddle