diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index daee55b7f9adfffdf709ed2b5b0d957c7ca1aea4..ec7f1446cfb74842af7d0c7152bebf58619f3861 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -198,6 +198,10 @@ identity_projection
 ..  autoclass:: paddle.v2.layer.identity_projection
     :noindex:
 
+slice_projection
+-------------------
+..  autoclass:: paddle.v2.layer.slice_projection
+    :noindex:
 
 table_projection
 ----------------
diff --git a/doc/design/scope.md b/doc/design/scope.md
index afe6bc028cafc5ee24b0041905857af58d3f5790..c9e0be716b606f6c7bf0373e0c6e632647e07a6f 100644
--- a/doc/design/scope.md
+++ b/doc/design/scope.md
@@ -37,8 +37,8 @@ Scope is an association of a name to variable. All variables belong to `Scope`.
 ```cpp
 class Scope {
  public:
-  Variable* CreateVariable(const std::string& name);
-  const Variable* GetVariable(const std::string& name) const;
+  Variable* NewVar(const std::string& name);
+  const Variable* FindVar(const std::string& name) const;
 
  private:
     std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
@@ -58,12 +58,12 @@ class Scope {
  public:
   Scope(const std::shared_ptr<Scope>& scope): parent_(scope) {}
 
-  Variable* GetVariable(const std::string& name) const {
+  Variable* FindVar(const std::string& name) const {
     auto it = vars_.find(name);
     if (it != vars_.end()) {
       return it->second.get();
     } else if (parent_ != nullptr) {
-      return parent_->GetVariable(name);
+      return parent_->FindVar(name);
     } else {
       return nullptr;
     }
@@ -95,10 +95,10 @@ class Scope {
   static std::shared_ptr<Scope> Create(const std::shared_ptr<Scope>& parent = nullptr);
 
   // return nullptr if not found.
-  Variable* GetVariable(const std::string& name) const;
+  Variable* FindVar(const std::string& name) const;
 
   // return if already contains same name variable.
-  Variable* CreateVariable(const std::string& name);
+  Variable* NewVar(const std::string& name);
 
  private:
   std::shared_ptr<Scope> parent_;
@@ -107,11 +107,11 @@ class Scope {
 ```
 ## Only scope can create a variable
 
-To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `CreateVariable` can construct `Variable`.
+To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `NewVar` can construct `Variable`.
 
 ## When scope destroyed, all variables inside this scope should be destroyed together
 
-The scope hold unique pointers for all variables. User can `GetVariable` from scope, but he should not hold this pointer as a member variable. Because when scope is destroyed, all variables inside this scope will be destroyed together.
+The scope holds unique pointers for all variables. Users can `FindVar` from a scope, but they should not hold this pointer as a member variable, because when the scope is destroyed, all variables inside this scope will be destroyed together.
 
 ## Sharing a parent scope
 
@@ -121,4 +121,4 @@ Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shar
 
 ## Orthogonal interface
 
-`GetVariable` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `CreateVariable` will return a `Error` when there is a name conflict locally. Combine `GetVariable` and `CreateVariable`, we can implement `CreateOrGetVariable` easily.
+`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `NewVar` will return an `Error` when there is a name conflict locally. Combining `FindVar` and `NewVar`, we can implement a find-or-create method easily.
diff --git a/go/pserver/client/c/test/test_train.py b/go/pserver/client/c/test/test_train.py
index 85cb399590f7a5e7e73285ca87c49ea5f24afb32..572a61e4ccaa9ef3d03a60d916e80eab907c6d88 100644
--- a/go/pserver/client/c/test/test_train.py
+++ b/go/pserver/client/c/test/test_train.py
@@ -3,24 +3,11 @@ import paddle.v2.dataset.uci_housing as uci_housing
 import paddle.v2.master as master
 import os
 import cPickle as pickle
+from paddle.v2.reader.creator import cloud_reader
 
 etcd_ip = os.getenv("MASTER_IP", "127.0.0.1")
-etcd_endpoint = "http://" + etcd_ip + ":2379"
-print "connecting to master, etcd endpoints: ", etcd_endpoint
-master_client = master.client(etcd_endpoint, 5, 64)
-
-
-def cloud_reader():
-    global master_client
-    master_client.set_dataset(
-        ["/pfs/dlnel/public/dataset/uci_housing/uci_housing-*"], passes=30)
-    while 1:
-        r, e = master_client.next_record()
-        if not r:
-            if e != -2:  # other errors
-                print "get record error:", e
-            break
-        yield pickle.loads(r)
+etcd_endpoints = "http://" + etcd_ip + ":2379"
+print "etcd endpoints: ", etcd_endpoints
 
 
 def main():
@@ -49,7 +36,7 @@ def main():
                                  parameters=parameters,
                                  update_equation=optimizer,
                                  is_local=False,
-                                 pserver_spec=etcd_endpoint,
+                                 pserver_spec=etcd_endpoints,
                                  use_etcd=True)
 
     # event_handler to print training and testing info
@@ -75,7 +62,11 @@ def main():
     trainer.train(
         reader=paddle.batch(
             paddle.reader.shuffle(
-                cloud_reader, buf_size=500), batch_size=2),
+                cloud_reader(
+                    ["/pfs/dlnel/public/dataset/uci_housing/uci_housing*"],
+                    etcd_endpoints),
+                buf_size=500),
+            batch_size=2),
         feeding={'x': 0,
                  'y': 1},
         event_handler=event_handler,
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 21cb7c7265e0052630b68954fa25f9189e641e7b..12a3a00bba35d476fca9c9fb47ac20b87e6f53f2 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -8,7 +8,9 @@ cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 
 cc_test(variable_test SRCS variable_test.cc)
-cc_test(scope_test SRCS scope_test.cc)
+
+cc_library(scope SRCS scope.cc)
+cc_test(scope_test SRCS scope_test.cc DEPS scope)
 
 proto_library(attr_type SRCS attr_type.proto)
 proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
@@ -16,7 +18,7 @@ proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
 cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
 cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
 
-cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor)
+cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor scope)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
 
 cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS op_proto operator)
@@ -30,4 +32,7 @@ add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch
 add_dependencies(framework_py_proto framework_py_proto_init)
 
 cc_library(net SRCS net.cc DEPS op_registry)
-cc_test(net_op_test SRCS net_op_test.cc DEPS net add_op mul_op sigmoid_op softmax_op fc_op)
+cc_test(net_op_test SRCS net_op_test.cc DEPS net)
+
+cc_library(backward SRCS backward.cc DEPS net)
+cc_test(backward_test SRCS backward_test.cc DEPS backward)
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0da11b91a7fe4a98e0832f70095c3200956ff001
--- /dev/null
+++ b/paddle/framework/backward.cc
@@ -0,0 +1,178 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/backward.h"
+#include <list>
+#include "paddle/framework/net.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+
+static bool AllInSet(const std::vector<std::string>& names,
+                     const std::string& suffix,
+                     const std::unordered_set<std::string>& set) {
+  for (auto& name : names) {
+    if (set.find(name + suffix) == set.end()) {
+      return false;
+    }
+  }
+  return true;
+}
+
+static std::shared_ptr<OperatorBase> NOP() {
+  auto net_op = std::make_shared<NetOp>();
+  net_op->type_ = "@NOP@";
+  net_op->CompleteAddOp();
+  return net_op;
+}
+
+//  Get the backward operator of a forward operator; recursive implementation.
+//
+//  no_grad_names: the gradient variable names that need no gradient computed.
+//
+//  uniq_id is a unique index used inside recursive calls of BackwardRecursive.
+//  Use `uid = uniq_id++;` to get the unique index, and pass `uniq_id` through
+//  the recursive calls.
+//
+//  Returns the backward operator. For a simple situation, it is a simple
+//  operator. For a complex situation, it is a NetOp.
+//
+//  See backward.h for details
+static std::shared_ptr<OperatorBase> BackwardRecursive(
+    const OperatorBase& forwardOp,
+    std::unordered_set<std::string>& no_grad_names, size_t& uniq_id);
+std::shared_ptr<OperatorBase> BackwardRecursive(
+    const OperatorBase& forwardOp,
+    std::unordered_set<std::string>& no_grad_names, size_t& uniq_id) {
+  //  If all input gradients of forwarding operator do not need to calculate,
+  //  just return an NOP. Not return null ptr because NOP does not take
+  //  too much time for calculation, but it is useful for simplifying logic.
+  if (AllInSet(forwardOp.inputs_, OperatorBase::GRAD_VAR_SUFFIX(),
+               no_grad_names)) {
+    return NOP();
+  }
+
+  //  All output gradients of forwarding operator do not need to calculate. Then
+  //  all input gradients cannot be computed at all, and we put them into
+  //  `no_grad_names` set. Return an NOP.
+  if (AllInSet(forwardOp.outputs_, OperatorBase::GRAD_VAR_SUFFIX(),
+               no_grad_names)) {
+    for (auto& name : forwardOp.inputs_) {
+      // Mark every input as not needing a gradient.
+      no_grad_names.insert(name + OperatorBase::GRAD_VAR_SUFFIX());
+    }
+    return NOP();
+  }
+
+  // Returned gradient network
+  auto net = std::make_shared<NetOp>();
+
+  if (forwardOp.IsNetOp()) {
+    // Because forwardOp is a net op, it can static_cast.
+    auto& forwardNet = static_cast<const NetOp&>(forwardOp);
+
+    // Map from output gradient variable name to operator's indices in backward
+    // net. That operator generates that variable.
+    std::unordered_map<std::string, std::vector<size_t>> dup_output_ops;
+
+    size_t local_op_id = 0;
+    // Traverse forwardNet in reverse order.
+    for (auto it = forwardNet.ops_.rbegin(); it != forwardNet.ops_.rend();
+         ++it, ++local_op_id) {
+      auto fwd = *it;
+      auto bwd = BackwardRecursive(*fwd, no_grad_names, uniq_id);
+      net->AddOp(bwd);
+      for (auto& out : bwd->outputs_) {
+        dup_output_ops[out].emplace_back(local_op_id);
+      }
+    }
+    // Get unique ID for this method.
+    auto uid = uniq_id++;
+    // TODO(dzh): more comment
+    using Pos = std::pair<size_t, std::shared_ptr<OperatorBase>>;
+    std::list<Pos> insert_position;
+    for (auto& dup_output_op : dup_output_ops) {
+      const std::string& name = dup_output_op.first;
+      auto& dup_op = dup_output_op.second;
+      if (dup_op.size() == 1) continue;
+      std::vector<std::string> dup_outputs;
+
+      for (size_t i = 0; i < dup_op.size(); ++i) {
+        auto op_offset = dup_op[i];
+        dup_outputs.push_back(name + "@RENAME@" + std::to_string(uid) + "@" +
+                              std::to_string(i));
+        net->ops_[op_offset]->Rename(name, dup_outputs.back());
+      }
+      insert_position.push_back(
+          {dup_op.back(),
+           OpRegistry::CreateOp(
+               "add", {dup_outputs}, {name},
+               {{"input_format",
+                 std::vector<int>{0, static_cast<int>(dup_outputs.size())}}})});
+    }
+
+    insert_position.sort(
+        [](const Pos& l, const Pos& r) { return l.first > r.first; });
+
+    for (auto& pos : insert_position) {
+      net->InsertOp(pos.first + 1, pos.second);
+    }
+
+  } else {
+    std::shared_ptr<OperatorBase> grad_op = OpRegistry::CreateGradOp(forwardOp);
+    for (std::string& grad_input : grad_op->inputs_) {
+      if (no_grad_names.count(grad_input)) {
+        std::string prefix = grad_input.substr(
+            0, grad_input.size() - OperatorBase::GRAD_VAR_SUFFIX().size());
+        grad_input = prefix + OperatorBase::ZERO_VAR_SUFFIX();
+
+        // If part of input gradient of that operator is not calculated, fill
+        // zero variables to that input gradient.
+        net->AddOp(OpRegistry::CreateOp("fill_zeros_like", {prefix},
+                                        {grad_input}, {}));
+      }
+    }
+
+    for (std::string& grad_output : grad_op->outputs_) {
+      if (no_grad_names.count(grad_output)) {
+        grad_output = OperatorBase::EMPTY_VAR_NAME();
+      }
+    }
+
+    if (net->ops_.empty()) {  // Current no aux op is added to network
+      return grad_op;
+    }
+    net->AddOp(grad_op);
+  }
+  net->type_ = "@GENERATED_BACKWARD@";
+  net->CompleteAddOp();
+  return net;
+}
+
+// See header for comments
+std::shared_ptr<OperatorBase> Backward(
+    const OperatorBase& forwardOp,
+    const std::unordered_set<std::string>& no_grad_vars) {
+  std::unordered_set<std::string> no_grad_names;
+  no_grad_names.reserve(no_grad_vars.size());
+
+  for (auto& name : no_grad_vars) {
+    no_grad_names.insert(name + OperatorBase::GRAD_VAR_SUFFIX());
+  }
+  size_t uid = 0;
+  return BackwardRecursive(forwardOp, no_grad_names, uid);
+}
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/backward.h b/paddle/framework/backward.h
new file mode 100644
index 0000000000000000000000000000000000000000..c181919dc165cf0b49362f85e22ceb4131bbd387
--- /dev/null
+++ b/paddle/framework/backward.h
@@ -0,0 +1,27 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <unordered_set>
+#include "operator.h"
+namespace paddle {
+namespace framework {
+
+// Create the backward operator from a forward operator.
+// TODO(yuyang18): Add more API reference comment.
+extern std::shared_ptr<OperatorBase> Backward(
+    const OperatorBase& forwardOp,
+    const std::unordered_set<std::string>& no_grad_vars);
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/backward.md b/paddle/framework/backward.md
new file mode 100644
index 0000000000000000000000000000000000000000..74c001b06a9e7b2279abf998604f2acf1b1168e4
--- /dev/null
+++ b/paddle/framework/backward.md
@@ -0,0 +1,38 @@
+## Operator/expression's Backward
+
+### Motivation
+
+In a neural network, the backpropagation algorithm follows the chain rule, so we need to compose the fundamental gradient operators/expressions together according to the chain rule. Every forward network needs a backward network to construct the full computation lineage; the operator/expression's Backward feature generates the backward pass with respect to the forward pass.
+
+### Implement : gradient operator registry
+
+|                        | forward operator | backward operator                |
+| ---------------------- | ---------------- | -------------------------------- |
+| **Operator::inputs_**  | Inputs           | Inputs, Outputs, OutputGradients |
+| **Operator::outputs_** | Outputs          | InputGradients                   |
+
+Inputs/Outputs mean the inputs/outputs of the operator, and InputGradients/OutputGradients are the gradients with respect to the forward operator. The forward operator and the backward operator are isomorphic; each saves the variables it needs into its member attributes.
+
+We use a global hash map to record the available gradient operators. Following the philosophy of a minimal core, each operator is a pluggable unit. Each gradient is an operator and it needs to register itself.
+
+grad_op_builder(fengjiayi)
+
+### Implement : Backward network
+
+Given a forward network, it generates the backward network. We only care about the gradients: `OutputGradients` and `InputGradients`.
+
+1. bla bla bla (yuyang)
+
+2. NetOp 
+
+   When the input forward network is a NetOp, it needs to call the backward functions of its sub NetOps/Operators recursively and ensure they are done. During the process, we need to collect the `OutputGradients` names.
+
+   We share variables in the same scope; as a result, duplicate operator `OutputGradients` will overwrite the duplicated variable.
+
+   ![duplicate_op](./images/duplicate_op)
+
+    Sharing a variable between operators, or using the same input variable in multiple operators, leads to a duplicate gradient variable. As the demo above shows, we need to rename the gradient names recursively and add a generic add operator that combines them into the original name.
+
+![duplicate_op2](./images/duplicate_op2)
+
+   Then collect the sub graph `OutputGradients`/`InputGradients` as the NetOp's and return it.
diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b095c2c3d5dbf21b5ea70e17475a4aaad9b1db44
--- /dev/null
+++ b/paddle/framework/backward_test.cc
@@ -0,0 +1,389 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/backward.h"
+
+#include <gtest/gtest.h>
+#include "paddle/framework/net.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+
+class EmptyOp : public OperatorBase {
+ public:
+  void InferShape(const Scope &scope) const override {}
+  void Run(const Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {}
+};
+
+class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input X of Add").IgnoreGradient();
+    AddInput("b", "Bias of Add").IgnoreGradient();
+    AddOutput("Out", "Out of Add").IgnoreGradient();
+    AddComment("Add Op");
+  }
+};
+
+class MulOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("A", "A");
+    AddInput("B", "B");
+    AddOutput("Out", "Out");
+    AddComment("Mul");
+  }
+};
+
+class SigmoidOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "X");
+    AddOutput("Y", "Y");
+    AddComment("Sigmoid");
+  }
+};
+
+class NoGradOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  NoGradOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "X input");
+    AddOutput("Y", "Y output");
+    AddComment("NoGradOp, same input output. no Grad");
+  }
+};
+
+class FcOp : public NetOp {
+ public:
+  void Init() override {
+    AddOp(OpRegistry::CreateOp("mul", {Input("X"), Input("W")},
+                               {Output("mul_result")}, {}));
+    auto b_name = Input("b");
+    std::string before_act = "mul_result";
+    if (b_name != EMPTY_VAR_NAME()) {
+      AddOp(OpRegistry::CreateOp("rowwise_add", {Output("mul_result"), b_name},
+                                 {Output("add_result")}, {}));
+      before_act = "add_result";
+    } else {
+      auto out_varname = Output("add_result");
+      if (out_varname != EMPTY_VAR_NAME()) {
+        this->Rename(out_varname, EMPTY_VAR_NAME());
+      }
+    }
+
+    AddOp(OpRegistry::CreateOp("sigmoid", {Output(before_act)}, {Output("Out")},
+                               {}));
+    CompleteAddOp(false);
+  }
+};
+
+class FcOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  FcOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "x");
+    AddInput("W", "w");
+    AddInput("b", "b");
+    AddOutput("mul_result", "").SetTemporary();
+    AddOutput("add_result", "").SetTemporary();
+    AddOutput("Out", "");
+    AddComment("");
+  }
+};
+
+class ManyOutputOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  ManyOutputOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("x", "x");
+    AddOutput("y", "y");
+    AddOutput("z", "z");
+    AddComment("");
+  }
+};
+
+class FillZeroOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("x", "x");
+    AddOutput("out", "out");
+    AddComment("");
+  }
+};
+
+class AddOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  AddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "x").SetMultiple();
+    AddOutput("Y", "y");
+    AddComment("");
+  }
+};
+}  // namespace framework
+}  // namespace paddle
+
+namespace f = paddle::framework;
+using EnforceNotMet = paddle::platform::EnforceNotMet;
+REGISTER_OP(rowwise_add, f::EmptyOp, f::RowWiseAddOpMaker);
+REGISTER_GRADIENT_OP(rowwise_add, rowwise_add_grad, f::EmptyOp);
+REGISTER_OP(mul, f::EmptyOp, f::MulOpMaker);
+REGISTER_GRADIENT_OP(mul, mul_grad, f::EmptyOp);
+REGISTER_OP(sigmoid, f::EmptyOp, f::SigmoidOpMaker);
+REGISTER_GRADIENT_OP(sigmoid, sigmoid_grad, f::EmptyOp);
+REGISTER_OP(nograd, f::EmptyOp, f::NoGradOpMaker);
+REGISTER_OP(fill_zeros_like, f::EmptyOp, f::FillZeroOpMaker);
+REGISTER_OP(add, f::EmptyOp, f::AddOpMaker);
+REGISTER_GRADIENT_OP(add, add_grad, f::EmptyOp);
+REGISTER_OP(fc, f::FcOp, f::FcOpMaker);
+REGISTER_OP(many_output_op, f::EmptyOp, f::ManyOutputOpMaker);
+REGISTER_GRADIENT_OP(many_output_op, many_output_op_grad, f::EmptyOp);
+
+TEST(Backward, simple_op_grad) {
+  auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {});
+  ASSERT_NE(fwd, nullptr);
+  auto gop = f::OpRegistry::CreateGradOp(*fwd);
+  ASSERT_EQ(1UL, gop->inputs_.size());
+  ASSERT_EQ("Out" + f::OperatorBase::GRAD_VAR_SUFFIX(), gop->inputs_[0]);
+  ASSERT_EQ("rowwise_add_grad", gop->type_);
+  ASSERT_EQ("X" + f::OperatorBase::GRAD_VAR_SUFFIX(), gop->outputs_[0]);
+  ASSERT_EQ("b" + f::OperatorBase::GRAD_VAR_SUFFIX(), gop->outputs_[1]);
+
+  ASSERT_EQ("X" + f::OperatorBase::GRAD_VAR_SUFFIX(),
+            gop->Output("X" + f::OperatorBase::GRAD_VAR_SUFFIX()));
+}
+
+TEST(Backward, simple_op_not_need_grad) {
+  auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {});
+  ASSERT_NE(fwd, nullptr);
+  auto gop = f::Backward(*fwd, {"X"});
+  ASSERT_EQ(std::find(gop->outputs_.begin(), gop->outputs_.end(),
+                      "X" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+            gop->outputs_.end());
+
+  auto no_input_gop = f::Backward(*fwd, {"X", "b"});
+  ASSERT_NE(no_input_gop, nullptr);
+  ASSERT_TRUE(no_input_gop->IsNetOp());
+  ASSERT_EQ(0UL, std::static_pointer_cast<f::NetOp>(no_input_gop)->ops_.size());
+}
+
+TEST(Backward, net_fc_backward_normal) {
+  std::shared_ptr<f::OperatorBase> fwd = f::OpRegistry::CreateOp(
+      "fc", {"X", "w", "b"}, {"mul_result", "add_result", "out"}, {});
+  ASSERT_NE(fwd, nullptr);
+  std::shared_ptr<f::OperatorBase> gop = f::Backward(*fwd, {});
+  ASSERT_TRUE(gop->IsNetOp());
+  auto net = static_cast<f::NetOp *>(gop.get());
+
+  ASSERT_NO_THROW(net->DebugString());
+
+  ASSERT_EQ(3UL, net->ops_.size());
+
+  f::OperatorBase &d_sigmoid = *net->ops_[0];
+  ASSERT_EQ("sigmoid_grad", d_sigmoid.type_);
+
+  f::OperatorBase &d_add = *net->ops_[1];
+  ASSERT_EQ("rowwise_add_grad", d_add.type_);
+
+  f::OperatorBase &d_mul = *net->ops_[2];
+  ASSERT_EQ("mul_grad", d_mul.type_);
+}
+
+TEST(Backward, net_fc_backward_not_have_b) {
+  std::shared_ptr<f::OperatorBase> fwd = f::OpRegistry::CreateOp(
+      "fc", {"X", "w", f::OperatorBase::EMPTY_VAR_NAME()},
+      {"mul_result", "add_result", "tmp"}, {});
+  ASSERT_NE(fwd, nullptr);
+  std::shared_ptr<f::OperatorBase> gop = f::Backward(*fwd, {});
+  ASSERT_TRUE(gop->IsNetOp());
+  auto net = static_cast<f::NetOp *>(gop.get());
+
+  ASSERT_NO_THROW(net->DebugString());
+
+  ASSERT_EQ(2UL, net->ops_.size());
+
+  f::OperatorBase &d_sigmoid = *net->ops_[0];
+  ASSERT_EQ("sigmoid_grad", d_sigmoid.type_);
+
+  f::OperatorBase &d_mul = *net->ops_[1];
+  ASSERT_EQ("mul_grad", d_mul.type_);
+}
+
+TEST(Backward, net_input_of_network_not_need_grad) {
+  f::NetOp net;
+  net.AddOp(f::OpRegistry::CreateOp("fc", {"X", "W1", "b1"},
+                                    {"mul_tmp_0", "add_tmp_0", "hidden0"}, {}));
+  net.AddOp(f::OpRegistry::CreateOp("fc", {"hidden0", "W2", "b2"},
+                                    {"mul_tmp_1", "add_tmp_1", "hidden1"}, {}));
+  net.CompleteAddOp();
+  auto bwd = Backward(net, {"X"});  // X@GRAD is not need.
+  ASSERT_TRUE(bwd->IsNetOp());
+  auto bwd_net = static_cast<f::NetOp *>(bwd.get());
+
+  std::unordered_set<std::string> all_output = std::unordered_set<std::string>(
+      bwd_net->outputs_.begin(), bwd_net->outputs_.end());
+  all_output.erase(f::OperatorBase::EMPTY_VAR_NAME());
+
+  for (auto &out : {"W1", "b1", "hidden0", "W2", "b2"}) {
+    ASSERT_NE(all_output.find(out + f::OperatorBase::GRAD_VAR_SUFFIX()),
+              all_output.end());
+  }
+
+  // Not Generated X
+  ASSERT_EQ(all_output.find("X" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+            all_output.end());
+
+  ASSERT_EQ(2UL, bwd_net->ops_.size());
+  ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp());
+  auto first_fc_grad = static_cast<f::NetOp *>(bwd_net->ops_[1].get());
+  ASSERT_EQ(3UL, first_fc_grad->ops_.size());
+  ASSERT_EQ(
+      f::OperatorBase::EMPTY_VAR_NAME(),
+      first_fc_grad->ops_[2]->Output("A" + f::OperatorBase::GRAD_VAR_SUFFIX()));
+}
+
+TEST(Backward, net_shared_weight) {
+  f::NetOp net;
+  net.AddOp(f::OpRegistry::CreateOp("mul", {"X", "W"}, {"Out"}, {}));
+  net.AddOp(f::OpRegistry::CreateOp("mul", {"Out", "W"}, {"FinalOut"}, {}));
+  net.CompleteAddOp();
+
+  auto bwd = f::Backward(net, {});
+  ASSERT_TRUE(bwd->IsNetOp());
+  auto bwd_net = static_cast<f::NetOp *>(bwd.get());
+  ASSERT_EQ(3UL, bwd_net->ops_.size());
+  ASSERT_EQ("add", bwd_net->ops_[2]->type_);
+}
+
+TEST(Backward, op_register_grad_not_for_network) {
+  auto fwd = f::OpRegistry::CreateOp(
+      "fc", {"X", "W", "b"}, {"mul_out", "add_out", "out1"},
+      {{"temporary_index", std::vector<int>{0, 1}}});
+
+  ASSERT_THROW(f::OpRegistry::CreateGradOp(*fwd), EnforceNotMet);
+}
+
+TEST(Backward, op_all_input_are_not_need) {
+  auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {});
+  auto backward = f::Backward(*fwd, {"X", "b"});
+  ASSERT_TRUE(backward->IsNetOp());
+  auto net = static_cast<f::NetOp *>(backward.get());
+  ASSERT_TRUE(net->ops_.empty());
+}
+
+TEST(Backward, op_all_output_are_not_need) {
+  auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {});
+  auto backward = f::Backward(*fwd, {"Out"});
+  ASSERT_TRUE(backward->IsNetOp());
+  auto net = static_cast<f::NetOp *>(backward.get());
+  ASSERT_TRUE(net->ops_.empty());
+}
+
+TEST(Backward, op_part_of_output_are_not_need) {
+  auto fwd = f::OpRegistry::CreateOp("many_output_op", {"X"}, {"Y", "Z"}, {});
+  auto backward = f::Backward(*fwd, {"Z"});
+  ASSERT_TRUE(backward->IsNetOp());
+  auto net = static_cast<f::NetOp *>(backward.get());
+  ASSERT_EQ(net->ops_.size(), 2UL);
+
+  auto &fill_zero = *net->ops_[0];
+  ASSERT_EQ("fill_zeros_like", fill_zero.type_);
+  ASSERT_EQ(1UL, fill_zero.inputs_.size());
+  ASSERT_EQ("Z", fill_zero.inputs_[0]);
+  ASSERT_EQ(1UL, fill_zero.outputs_.size());
+  ASSERT_EQ("Z" + f::OperatorBase::ZERO_VAR_SUFFIX(), fill_zero.outputs_[0]);
+
+  auto &d_many_out = *net->ops_[1];
+  ASSERT_EQ("many_output_op_grad", d_many_out.type_);
+  ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.inputs_.size());  // I/O/OG
+  ASSERT_EQ("Z" + f::OperatorBase::ZERO_VAR_SUFFIX(),
+            d_many_out.Input("z" + f::OperatorBase::GRAD_VAR_SUFFIX()));
+  ASSERT_EQ("Y" + f::OperatorBase::GRAD_VAR_SUFFIX(),
+            d_many_out.Input("y" + f::OperatorBase::GRAD_VAR_SUFFIX()));
+  ASSERT_EQ("X" + f::OperatorBase::GRAD_VAR_SUFFIX(),
+            d_many_out.Output("x" + f::OperatorBase::GRAD_VAR_SUFFIX()));
+}
+
+TEST(Backward, op_part_of_input_are_not_need) {
+  auto fwd = f::OpRegistry::CreateOp("mul", {"a", "b"}, {"out"}, {});
+  auto backward = f::Backward(*fwd, {"a"});
+  auto &grad_mul = *backward;
+  ASSERT_EQ(grad_mul.type_, "mul_grad");
+  ASSERT_EQ(grad_mul.inputs_.size(), 2UL + 1UL + 1UL);
+  ASSERT_EQ(grad_mul.outputs_.size(), 2UL);
+  ASSERT_EQ(grad_mul.Output("A" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+            f::OperatorBase::EMPTY_VAR_NAME());
+  ASSERT_EQ(grad_mul.Output("B" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+            "b" + f::OperatorBase::GRAD_VAR_SUFFIX());
+  ASSERT_EQ(grad_mul.Input("Out" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+            "out" + f::OperatorBase::GRAD_VAR_SUFFIX());
+  ASSERT_EQ(grad_mul.Input("A"), "a");
+  ASSERT_EQ(grad_mul.Input("B"), "b");
+  ASSERT_EQ(grad_mul.Input("Out"), "out");
+}
+
+TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
+  f::NetOp net;
+  net.AddOp(f::OpRegistry::CreateOp("fc", {"x1", "w1", "b1"},
+                                    {"mul_out1", "add_out1", "out1"}, {}));
+  net.AddOp(f::OpRegistry::CreateOp("fc", {"out1", "w2", "b2"},
+                                    {"mul_out2", "tmp_out2", "out2"}, {}));
+  net.AddOp(f::OpRegistry::CreateOp("fc", {"out2", "w3", "b3"},
+                                    {"mul_out3", "tmp_out3", "out3"}, {}));
+  net.CompleteAddOp();
+  auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"});
+  ASSERT_TRUE(backward->IsNetOp());
+  auto bwd_net = static_cast<f::NetOp *>(backward.get());
+  ASSERT_EQ(bwd_net->ops_.size(), 3UL);
+  auto &grad_fc = *bwd_net->ops_[0];
+  EXPECT_EQ(grad_fc.inputs_.size(),
+            3UL       /* external input number */
+                + 1UL /* external output number*/
+                + 1UL /* number of gradient of external output*/
+                - 1UL /*ignoreGradient varable number*/
+                + 2U /* internal variable number*/);
+  EXPECT_EQ(grad_fc.outputs_.size(), 2UL       /* input number of mul*/
+                                         + 2UL /* input number of rowwise_add */
+                                         + 1UL /* input number of sigmod */);
+  EXPECT_EQ(bwd_net->ops_[1]->inputs_.size(), 0UL);
+  EXPECT_EQ(bwd_net->ops_[1]->outputs_.size(), 0UL);
+  EXPECT_EQ(bwd_net->ops_[2]->inputs_.size(), 0UL);
+  EXPECT_EQ(bwd_net->ops_[2]->outputs_.size(), 0UL);
+
+  /*
+    EXPECT_EQ(grad_fc.Output("X" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+              f::OperatorBase::EMPTY_VAR_NAME());
+  EXPECT_EQ(grad_fc.Output("W" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+    "w3" + f::OperatorBase::GRAD_VAR_SUFFIX());
+  EXPECT_EQ(grad_fc.Output("b" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+    "b3" + f::OperatorBase::GRAD_VAR_SUFFIX());
+  EXPECT_EQ(grad_fc.Output("mul_result" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+  "mul_out3" + f::OperatorBase::GRAD_VAR_SUFFIX());
+
+  EXPECT_EQ(grad_fc.Input("Out" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+  "out3" + f::OperatorBase::GRAD_VAR_SUFFIX());
+  EXPECT_EQ(grad_fc.Input("X"), "out2");
+  EXPECT_EQ(grad_fc.Input("W"), "w3");
+  EXPECT_EQ(grad_fc.Input("mul_result"), "mul_out3");
+  EXPECT_EQ(grad_fc.Input("add_result"), "tmp_out3");
+  EXPECT_EQ(grad_fc.Input("Out"), "out3");
+  */
+}
diff --git a/paddle/framework/eigen.h b/paddle/framework/eigen.h
index 5f3358c69b3fbbbfcd97a96ab50fde3d8b9efad0..a4667cc51fadfc020d3211b7a82356db386fced1 100644
--- a/paddle/framework/eigen.h
+++ b/paddle/framework/eigen.h
@@ -80,5 +80,21 @@ struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
   }
 };
 
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenScalar {
+  // Scalar tensor (implemented as a rank-0 tensor) of scalar type T.
+  using Type = Eigen::TensorMap<
+      Eigen::TensorFixedSize<T, Eigen::Sizes<>, MajorType, IndexType>>;
+  using ConstType = Eigen::TensorMap<
+      Eigen::TensorFixedSize<const T, Eigen::Sizes<>, MajorType, IndexType>>;
+
+  static Type From(Tensor& tensor) { return Type(tensor.data<T>()); }
+
+  static ConstType From(const Tensor& tensor) {
+    return ConstType(tensor.data<T>());
+  }
+};
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/eigen_test.cc b/paddle/framework/eigen_test.cc
index a9fa728e49a0dcc781e520a22c1ee5f921c4c733..dc1957691b1a202826e10e84c21ac8874df9e378 100644
--- a/paddle/framework/eigen_test.cc
+++ b/paddle/framework/eigen_test.cc
@@ -46,6 +46,17 @@ TEST(Eigen, Tensor) {
   }
 }
 
+TEST(Eigen, ScalarFrom) {
+  Tensor t;
+  int* p = t.mutable_data<int>(make_ddim({1}), platform::CPUPlace());
+  *p = static_cast<int>(100);
+
+  EigenScalar<int>::Type es = EigenScalar<int>::From(t);
+
+  ASSERT_EQ(0, es.dimension(0));
+  ASSERT_EQ(100, es(0));
+}
+
 TEST(Eigen, VectorFrom) {
   Tensor t;
   float* p = t.mutable_data<float>(make_ddim({6}), platform::CPUPlace());
diff --git a/paddle/framework/grad_op_builder.cc b/paddle/framework/grad_op_builder.cc
index 6235be75f27dadb65de663ff1b3caf26a649f6cb..dd686cc78246f06cdc3ec7d013086863d7e8fac0 100644
--- a/paddle/framework/grad_op_builder.cc
+++ b/paddle/framework/grad_op_builder.cc
@@ -20,7 +20,7 @@ namespace framework {
 
 OperatorBase* GradOpBuilder::Build() {
   BuildOpInOutArgList();
-  std::string grad_op_type = OpRegistry::grad_ops().at(op_->type_);
+  std::string grad_op_type = OpRegistry::grad_ops().at(op_.type_);
   OperatorBase* grad_op = OpRegistry::op_creators().at(grad_op_type)();
   grad_op->type_ = grad_op_type;
   CompleteGradOp(grad_op);
@@ -39,15 +39,15 @@ OpInOutArg* GradOpBuilder::BuildArg(const VarProto& var,
 }
 
 void GradOpBuilder::BuildOpInOutArgList() {
-  const OpProto& op_proto = OpRegistry::protos().at(op_->type_);
-  const auto& var_map = *(OpRegistry::VarIndexMaps().at(op_->type_));
+  const OpProto& op_proto = OpRegistry::protos().at(op_.type_);
+  const auto& var_map = *(OpRegistry::VarIndexMaps().at(op_.type_));
   const std::vector<int>& in_format =
-      op_->attrs_.count("input_format")
-          ? op_->GetAttr<std::vector<int>>("input_format")
+      op_.attrs_.count("input_format")
+          ? op_.GetAttr<std::vector<int>>("input_format")
           : std::vector<int>();
   const std::vector<int>& out_format =
-      op_->attrs_.count("output_format")
-          ? op_->GetAttr<std::vector<int>>("output_format")
+      op_.attrs_.count("output_format")
+          ? op_.GetAttr<std::vector<int>>("output_format")
           : std::vector<int>();
   for (const auto& var : op_proto.inputs()) {
     arg_list_.emplace_back(
@@ -70,8 +70,7 @@ void GradOpBuilder::AddArgIntoGradOp(const OpInOutArg* arg,
   }
   (*varmap)[var_name] = idx++;
   size_t pre_sz = in_out.size();
-  auto base_it =
-      arg->type_ == IN ? op_->inputs_.begin() : op_->outputs_.begin();
+  auto base_it = arg->type_ == IN ? op_.inputs_.begin() : op_.outputs_.begin();
   std::copy(base_it + arg->begin_idx_, base_it + arg->end_idx_,
             std::back_inserter(in_out));
   if (is_grad) {
@@ -83,7 +82,7 @@ void GradOpBuilder::AddArgIntoGradOp(const OpInOutArg* arg,
 }
 
 void GradOpBuilder::CompleteGradOp(OperatorBase* grad_op) const {
-  grad_op->attrs_ = op_->attrs_;
+  grad_op->attrs_ = op_.attrs_;
   grad_op->attrs_.erase("input_format");
   grad_op->attrs_.erase("output_format");
   VarIndexMap* grad_varmap = new VarIndexMap();
diff --git a/paddle/framework/grad_op_builder.h b/paddle/framework/grad_op_builder.h
index 2ecf39479b4f4a51f89cd500caf851897df0e599..cc7a76f3726e00a08fbe06bca4c9b9f5bad466b4 100644
--- a/paddle/framework/grad_op_builder.h
+++ b/paddle/framework/grad_op_builder.h
@@ -29,7 +29,7 @@ class GradOpBuilder {
   using VarIndexMap = std::unordered_map<std::string, int>;
 
  public:
-  GradOpBuilder(const OperatorBase* op) : op_(op) {}
+  GradOpBuilder(const OperatorBase& op) : op_(op) {}
   OperatorBase* Build();
 
  private:
@@ -40,7 +40,7 @@ class GradOpBuilder {
                         std::vector<int>& format, VarIndexMap* varmap, int& idx,
                         bool is_grad) const;
   void CompleteGradOp(OperatorBase* grad_op) const;
-  const OperatorBase* op_;
+  const OperatorBase& op_;
   std::vector<std::shared_ptr<OpInOutArg>> arg_list_;
 };
 
diff --git a/paddle/framework/grad_op_builder_test.cc b/paddle/framework/grad_op_builder_test.cc
index 288a7841cd7c9212d8fa230e38d49dfc26e76256..e9cf3b9798db2cbfb8d26259ae9a6741fbae8278 100644
--- a/paddle/framework/grad_op_builder_test.cc
+++ b/paddle/framework/grad_op_builder_test.cc
@@ -11,7 +11,7 @@ namespace framework {
 TEST(GradOpBuilder, AddTwo) {
   std::shared_ptr<OperatorBase> add_op(
       OpRegistry::CreateOp("add_two", {"x", "y"}, {"out"}, {}));
-  std::shared_ptr<OperatorBase> grad_add_op = OpRegistry::CreateGradOp(add_op);
+  std::shared_ptr<OperatorBase> grad_add_op = OpRegistry::CreateGradOp(*add_op);
   EXPECT_EQ(static_cast<int>(grad_add_op->inputs_.size()), 4);
   EXPECT_EQ(static_cast<int>(grad_add_op->outputs_.size()), 2);
   EXPECT_EQ(grad_add_op->Input("X"), "x");
diff --git a/paddle/framework/images/duplicate_op.graffle b/paddle/framework/images/duplicate_op.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..5979f792e252f028a615729215529c2be42d9165
Binary files /dev/null and b/paddle/framework/images/duplicate_op.graffle differ
diff --git a/paddle/framework/images/duplicate_op.png b/paddle/framework/images/duplicate_op.png
new file mode 100644
index 0000000000000000000000000000000000000000..f299c5d37f260a1bb0daec886f0a4ee1c1f31c92
Binary files /dev/null and b/paddle/framework/images/duplicate_op.png differ
diff --git a/paddle/framework/images/duplicate_op2.graffle b/paddle/framework/images/duplicate_op2.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..2b658085d6a55d368c320051ba7f94ec2900f13c
Binary files /dev/null and b/paddle/framework/images/duplicate_op2.graffle differ
diff --git a/paddle/framework/images/duplicate_op2.png b/paddle/framework/images/duplicate_op2.png
new file mode 100644
index 0000000000000000000000000000000000000000..c5588015d1450fd8c1bda3580680d884494868bb
Binary files /dev/null and b/paddle/framework/images/duplicate_op2.png differ
diff --git a/paddle/framework/net.h b/paddle/framework/net.h
index 089c1355951f59d51db16d4b4bdce4282d6e5c25..acf1a69da9fd8adce1bd89367c882eade052e725 100644
--- a/paddle/framework/net.h
+++ b/paddle/framework/net.h
@@ -43,7 +43,7 @@ class NetOp : public OperatorBase {
    * Infer all the operators' input and output variables' shapes, will be called
    * before every mini-batch
    */
-  void InferShape(const std::shared_ptr<Scope>& scope) const override {
+  void InferShape(const Scope& scope) const override {
     for (auto& op : ops_) {
       op->InferShape(scope);
     }
@@ -56,7 +56,7 @@ class NetOp : public OperatorBase {
    * scope will be used instead. If no OpContext is provicded, default context
    * will be used.
    */
-  void Run(const std::shared_ptr<Scope>& scope,
+  void Run(const Scope& scope,
            const platform::DeviceContext& dev_ctx) const override {
     for (auto& op : ops_) {
       op->Run(scope, dev_ctx);
@@ -68,9 +68,18 @@ class NetOp : public OperatorBase {
    */
   void AddOp(const std::shared_ptr<OperatorBase>& op) {
     PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed");
+    PADDLE_ENFORCE(op != nullptr, "Cannot Insert Null op");
     ops_.push_back(op);
   }
 
+  void InsertOp(size_t pos, const std::shared_ptr<OperatorBase>& op) {
+    PADDLE_ENFORCE(!add_op_done_,
+                   "Cannot InsertOp when this network is sealed");
+    PADDLE_ENFORCE(op != nullptr, "Cannot Insert Null op");
+    PADDLE_ENFORCE(pos <= ops_.size(), "Out of range");
+    ops_.insert(ops_.begin() + pos, op);
+  }
+
   void CompleteAddOp(bool calculate = true);
 
   std::string DebugString() const override;
diff --git a/paddle/framework/net_op_test.cc b/paddle/framework/net_op_test.cc
index 8048311fe54ee1827fb5b91577478a1d30803e43..f32e456e5d142bf8203f9ec03e8059772c4f5c99 100644
--- a/paddle/framework/net_op_test.cc
+++ b/paddle/framework/net_op_test.cc
@@ -3,11 +3,6 @@
 #include <paddle/framework/op_registry.h>
 #include <paddle/framework/operator.h>
 
-USE_OP(add_two);
-USE_OP(mul);
-USE_OP(sigmoid);
-USE_OP(softmax);
-
 namespace paddle {
 namespace framework {
 
@@ -16,16 +11,22 @@ static int run_cnt = 0;
 
 class TestOp : public OperatorBase {
  public:
-  void InferShape(
-      const std::shared_ptr<framework::Scope>& scope) const override {
+  void InferShape(const framework::Scope& scope) const override {
     ++infer_shape_cnt;
   }
-  void Run(const std::shared_ptr<framework::Scope>& scope,
+  void Run(const framework::Scope& scope,
            const paddle::platform::DeviceContext& dev_ctx) const override {
     ++run_cnt;
   }
 };
 
+class EmptyOp : public OperatorBase {
+ public:
+  void InferShape(const Scope& scope) const override {}
+  void Run(const Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override {}
+};
+
 template <typename T>
 void AssertSameVectorWithoutOrder(const std::vector<T>& expected,
                                   const std::vector<T>& actual) {
@@ -62,7 +63,7 @@ TEST(OpKernel, all) {
   ASSERT_EQ(1UL, tmp_idx.size());
   ASSERT_EQ("y", net->outputs_[tmp_idx[0]]);
 
-  auto scope = std::make_shared<Scope>();
+  Scope scope;
   platform::CPUDeviceContext dev_ctx;
 
   net->InferShape(scope);
@@ -72,20 +73,17 @@ TEST(OpKernel, all) {
   ASSERT_THROW(net->AddOp(op2), paddle::platform::EnforceNotMet);
 }
 
-//! TODO(yuyang18): Refine Backward Op.
-// TEST(AddBackwardOp, TestGradOp) {
-//  auto net = std::make_shared<NetOp>();
-//  ASSERT_NE(net, nullptr);
-//  net->AddOp(framework::OpRegistry::CreateOp("mul", {"X", "Y"}, {"Out"}, {}));
-//  net->AddOp(
-//      framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {"Out"}, {}));
-//  net->AddOp(framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {""},
-//  {}));
-//  auto grad_ops = AddBackwardOp(net);
-//  for (auto& op : grad_ops->ops_) {
-//    op->DebugString();
-//  }
-//}
+TEST(Net, insert_op) {
+  NetOp net;
+  auto op1 = std::make_shared<EmptyOp>();
+  op1->inputs_ = {"x", "w1", "b1"};
+  op1->outputs_ = {"y"};
+  net.AddOp(op1);
+  net.InsertOp(0, op1);
+  ASSERT_EQ(2UL, net.ops_.size());
+  net.InsertOp(2, op1);
+  ASSERT_EQ(3UL, net.ops_.size());
+}
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index 384f0f631dd9b9a4dd7c0c628340afe668bc248f..f10c9297981a4c6aefc6c2072d0ac2b8e562a7a0 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -86,43 +86,46 @@ class OpProtoAndCheckerMaker {
   }
 
  protected:
-  void AddInput(const std::string& name, const std::string& comment,
-                bool multiple = false, bool ignore_gradient = false) {
+  struct VariableBuilder {
+    VarProto* var_;
+    std::function<void()> on_multiple_;
+    std::function<void()> on_temporary_;
+
+    VariableBuilder& SetMultiple() {
+      var_->set_multiple(true);
+      on_multiple_();
+      return *this;
+    }
+
+    VariableBuilder& SetTemporary() {
+      PADDLE_ENFORCE(bool(on_temporary_), "Cannot set temporary");
+      var_->set_temporary(true);
+      on_temporary_();
+      return *this;
+    }
+
+    VariableBuilder& IgnoreGradient() {
+      var_->set_ignore_gradient(true);
+      return *this;
+    }
+  };
+
+  VariableBuilder AddInput(const std::string& name,
+                           const std::string& comment) {
     auto input = proto_->mutable_inputs()->Add();
     *input->mutable_name() = name;
     *input->mutable_comment() = comment;
-    input->set_ignore_gradient(ignore_gradient);
-    input->set_multiple(multiple);
-    if (multiple) {
-      SetHasMultipleInput();
-    }
-  }
-
-  void AddInputs(const std::string& name, const std::string& comment,
-                 bool ignore_gradient = false) {
-    AddInput(name, comment, true, ignore_gradient);
+    return VariableBuilder{input, [=] { this->SetHasMultipleInput(); },
+                           nullptr};
   }
 
-  void AddOutput(const std::string& name, const std::string& comment,
-                 bool temporary = false, bool multiple = false,
-                 bool ignore_gradient = false) {
+  VariableBuilder AddOutput(const std::string& name,
+                            const std::string& comment) {
     auto output = proto_->mutable_outputs()->Add();
     *output->mutable_name() = name;
     *output->mutable_comment() = comment;
-    output->set_ignore_gradient(ignore_gradient);
-    output->set_multiple(multiple);
-    if (multiple) {
-      SetHasMultipleOutput();
-    }
-    output->set_temporary(temporary);
-    if (temporary) {
-      SetHasTemporaryOutput();
-    }
-  }
-
-  void AddOutputs(const std::string& name, const std::string& comment,
-                  bool temporary = false, bool ignore_gradient = false) {
-    AddOutput(name, comment, temporary, true, ignore_gradient);
+    return VariableBuilder{output, [=] { this->SetHasMultipleOutput(); },
+                           [=] { this->SetHasTemporaryOutput(); }};
   }
 
   template <typename T>
@@ -300,9 +303,10 @@ class OpRegistry {
     return CreateOp(op_desc.type(), inputs, outputs, attrs);
   }
 
-  static std::shared_ptr<OperatorBase> CreateGradOp(
-      std::shared_ptr<OperatorBase> op) {
-    GradOpBuilder builder(op.get());
+  static std::shared_ptr<OperatorBase> CreateGradOp(const OperatorBase& op) {
+    PADDLE_ENFORCE(!op.IsNetOp(),
+                   "Use framework::Backward to get backward ops");
+    GradOpBuilder builder(op);
     std::shared_ptr<OperatorBase> grad_op(builder.Build());
     grad_op->Init();
     return grad_op;
diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc
index 2ef781bf8672c8aa53ae32a44f1ea61973f3792c..9894928a7aa19bc6c7ad8b230562fb9a681cfebd 100644
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -7,9 +7,9 @@ namespace paddle {
 namespace framework {
 class CosineOp : public OperatorBase {
  public:
-  void Run(const std::shared_ptr<Scope>& scope,
+  void Run(const Scope& scope,
            const platform::DeviceContext& dev_ctx) const override {}
-  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
+  void InferShape(const Scope& scope) const override {}
 };
 
 class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
@@ -27,8 +27,8 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 
 class MyTestOp : public OperatorBase {
  public:
-  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
-  void Run(const std::shared_ptr<Scope>& scope,
+  void InferShape(const Scope& scope) const override {}
+  void Run(const Scope& scope,
            const platform::DeviceContext& dev_ctx) const override {}
 };
 
@@ -36,9 +36,8 @@ class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
  public:
   MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInputs("input", "input of cosine op");
-    AddOutput("output", "output of cosine op",
-              /*temporary*/ true);
+    AddInput("input", "input of cosine op").SetMultiple();
+    AddOutput("output", "output of cosine op").SetTemporary();
     auto my_checker = [](int i) {
       PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!");
     };
@@ -69,7 +68,7 @@ TEST(OpRegistry, CreateOp) {
 
   std::shared_ptr<paddle::framework::OperatorBase> op =
       paddle::framework::OpRegistry::CreateOp(op_desc);
-  auto scope = std::make_shared<paddle::framework::Scope>();
+  paddle::framework::Scope scope;
   paddle::platform::CPUDeviceContext dev_ctx;
   op->Run(scope, dev_ctx);
   float scale_get = op->GetAttr<float>("scale");
@@ -111,7 +110,7 @@ TEST(OpRegistry, DefaultValue) {
 
   std::shared_ptr<paddle::framework::OperatorBase> op =
       paddle::framework::OpRegistry::CreateOp(op_desc);
-  auto scope = std::make_shared<paddle::framework::Scope>();
+  paddle::framework::Scope scope;
   paddle::platform::CPUDeviceContext dev_ctx;
   op->Run(scope, dev_ctx);
   ASSERT_EQ(op->GetAttr<float>("scale"), 1.0);
@@ -173,7 +172,7 @@ TEST(OpRegistry, CustomChecker) {
   SetInputFormat(&op_desc);
   auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
   paddle::platform::CPUDeviceContext dev_ctx;
-  auto scope = std::make_shared<paddle::framework::Scope>();
+  paddle::framework::Scope scope;
   op->Run(scope, dev_ctx);
   int test_attr = op->GetAttr<int>("test_attr");
   ASSERT_EQ(test_attr, 4);
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 3a1ffc02151f42a4fe6f103925ab424251ee8d85..e3c510b70346a2baf6ccd756eaf689c146efee5f 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -20,7 +20,7 @@ namespace paddle {
 namespace framework {
 
 template <>
-Eigen::DefaultDevice* KernelContext::GetEigenDevice<
+Eigen::DefaultDevice* ExecutionContext::GetEigenDevice<
     platform::CPUPlace, Eigen::DefaultDevice>() const {
   return device_context_.get_eigen_device<Eigen::DefaultDevice>();
 }
@@ -28,7 +28,7 @@ Eigen::DefaultDevice* KernelContext::GetEigenDevice<
 #ifndef PADDLE_ONLY_CPU
 template <>
 Eigen::GpuDevice*
-KernelContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
+ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
   return device_context_.get_eigen_device<Eigen::GpuDevice>();
 }
 #endif
@@ -52,7 +52,7 @@ std::vector<std::string> OperatorBase::Inputs(const std::string& name) const {
   PADDLE_ENFORCE(in_out_idxs_ != nullptr, "IO Idx could not be nullptr");
   auto input_format = GetAttr<std::vector<int>>("input_format");
   auto offset = in_out_idxs_->at(name);
-  PADDLE_ENFORCE(input_format.at((size_t)offset + 1) <= inputs_.size(),
+  PADDLE_ENFORCE(input_format.at((size_t)offset + 1) <= (int)inputs_.size(),
                  "Input Out Of Range");
 
   return std::vector<std::string>{
@@ -78,7 +78,7 @@ std::vector<std::string> OperatorBase::Outputs(const std::string& name) const {
   PADDLE_ENFORCE(in_out_idxs_ != nullptr, "InOut Indice could not be nullptr");
   auto output_format = GetAttr<std::vector<int>>("output_format");
   auto offset = in_out_idxs_->at(name);
-  PADDLE_ENFORCE(output_format.at((size_t)offset + 1) <= outputs_.size(),
+  PADDLE_ENFORCE(output_format.at((size_t)offset + 1) <= (int)outputs_.size(),
                  "Output Out of Range");
   return std::vector<std::string>{
       outputs_.begin() + output_format.at(offset),
@@ -105,5 +105,11 @@ std::string OperatorBase::DebugString() const {
   return ss.str();
 }
 
+void OperatorBase::Rename(const std::string& old_name,
+                          const std::string& new_name) {
+  std::replace(inputs_.begin(), inputs_.end(), old_name, new_name);
+  std::replace(outputs_.begin(), outputs_.end(), old_name, new_name);
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 0a8c82ee47521713fa96cb423ceca4de858c260c..6a9fe19b9b61333cf9db1cca3e34c72f3f9c99c5 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <algorithm>
 #include <boost/variant.hpp>
 #include <string>
 #include <unordered_map>
@@ -31,22 +32,9 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-template <typename T>
-struct EigenDeviceConverter;
-
-template <>
-struct EigenDeviceConverter<platform::CPUPlace> {
-  using EigenDeviceType = Eigen::DefaultDevice;
-};
-
-#ifndef PADDLE_ONLY_CPU
-template <>
-struct EigenDeviceConverter<platform::GPUPlace> {
-  using EigenDeviceType = Eigen::GpuDevice;
-};
-#endif
-
 class OperatorBase;
+class InferShapeContext;
+class ExecutionContext;
 /**
  * OperatorBase has the basic element that Net will call to do computation.
  * Only CreateOperator from OpRegistry will new Operator directly. User
@@ -67,6 +55,9 @@ class OperatorBase {
   /// e.g. Variable "x@GRAD" is the gradient of varibale "x".
   static std::string GRAD_VAR_SUFFIX() { return "@GRAD"; }
 
+  /// Variables with this suffix are supposed to be filled up with zeros.
+  static std::string ZERO_VAR_SUFFIX() { return "@ZERO"; }
+
   virtual ~OperatorBase() {}
 
   template <typename T>
@@ -84,16 +75,20 @@ class OperatorBase {
 
   /// InferShape infer the size of Variables used by this Operator with
   /// information inside scope
-  virtual void InferShape(const std::shared_ptr<Scope>& scope) const = 0;
+  virtual void InferShape(const Scope& scope) const = 0;
 
   /// Net will call this function to Run an op.
-  virtual void Run(const std::shared_ptr<Scope>& scope,
+  virtual void Run(const Scope& scope,
                    const platform::DeviceContext& dev_ctx) const = 0;
 
   virtual bool IsNetOp() const { return false; }
 
+  /// Rename old_name to new_name wherever it appears in inputs_ or outputs_.
+  void Rename(const std::string& old_name, const std::string& new_name);
+
   //! Get a input with argument's name described in `op_proto`
   const std::string& Input(const std::string& name) const;
+
   //! Get a input which has multiple variables.
   //! TODO add a vector_view to prevent memory copy.
   std::vector<std::string> Inputs(const std::string& name) const;
@@ -105,53 +100,140 @@ class OperatorBase {
 
  public:
   std::string type_;
+  // NOTE: in case of OpGrad, inputs_ contains:
+  // I (Inputs)
+  // O (Outputs)
+  // OG (Output Gradients)
   std::vector<std::string> inputs_;
+  // NOTE: in case of OpGrad, outputs_ contains
+  // IG (Inputs Gradients)
   std::vector<std::string> outputs_;
   AttributeMap attrs_;
   // store the arguments' offset described in op_desc.
   std::shared_ptr<std::unordered_map<std::string, int>> in_out_idxs_;
 };
 
-class KernelContext {
+class OperatorContext {
  public:
-  KernelContext(const OperatorBase* op, const std::shared_ptr<Scope>& scope,
-                const platform::DeviceContext& device_context)
-      : op_(*op), scope_(scope), device_context_(device_context) {}
+  OperatorContext(const OperatorBase* op, const Scope& scope)
+      : op_(*op), scope_(scope) {}
+
+  size_t InputSize() const { return op_.inputs_.size(); }
 
-  const Variable* Input(int index) const {
-    return scope_->GetVariable(op_.inputs_[index]);
+  size_t OutputSize() const { return op_.outputs_.size(); }
+
+  const Variable* InputVar(const size_t index) const {
+    return scope_.FindVar(op_.inputs_.at(index));
   }
 
-  Variable* Output(int index) const {
-    return scope_->GetVariable(op_.outputs_[index]);
+  Variable* OutputVar(const size_t index) const {
+    return scope_.FindVar(op_.outputs_.at(index));
   }
 
-  const Variable* Input(const std::string& name) const {
-    return scope_->GetVariable(op_.Input(name));
+  const Variable* InputVar(const std::string& name) const {
+    return scope_.FindVar(op_.Input(name));
   }
 
-  const Variable* Output(const std::string& name) const {
-    return scope_->GetVariable(op_.Output(name));
+  Variable* OutputVar(const std::string& name) const {
+    return scope_.FindVar(op_.Output(name));
   }
 
-  const std::vector<const Variable*> Inputs(const std::string& name) const {
+  const std::vector<const Variable*> MultiInputVar(
+      const std::string& name) const {
     auto names = op_.Inputs(name);
     std::vector<const Variable*> res;
+    res.reserve(names.size());
     std::transform(
-        names.begin(), names.end(), res.begin(),
-        [this](const std::string& name) { return scope_->GetVariable(name); });
+        names.begin(), names.end(), std::back_inserter(res),
+        [this](const std::string& name) { return scope_.FindVar(name); });
     return res;
   }
 
-  const std::vector<const Variable*> Outputs(const std::string& name) const {
+  std::vector<const Variable*> MultiOutputVar(const std::string& name) const {
     auto names = op_.Outputs(name);
     std::vector<const Variable*> res;
+    res.reserve(names.size());
     std::transform(
-        names.begin(), names.end(), res.begin(),
-        [this](const std::string& name) { return scope_->GetVariable(name); });
+        names.begin(), names.end(), std::back_inserter(res),
+        [this](const std::string& name) { return scope_.FindVar(name); });
+    return res;
+  }
+
+  template <typename T>
+  const T* Input(const size_t index) const {
+    return &(InputVar(index)->Get<T>());
+  }
+
+  template <typename T>
+  T* Output(const size_t index) const {
+    return OutputVar(index)->GetMutable<T>();
+  }
+
+  template <typename T>
+  const T* Input(const std::string& name) const {
+    return &(InputVar(name)->Get<T>());
+  }
+
+  template <typename T>
+  T* Output(const std::string& name) const {
+    return OutputVar(name)->GetMutable<T>();
+  }
+
+  template <typename T>
+  const std::vector<const T*> MultiInput(const std::string& name) const {
+    auto names = op_.Inputs(name);
+    std::vector<const T*> res;
+    res.reserve(names.size());
+    std::transform(names.begin(), names.end(), std::back_inserter(res),
+                   [this](const std::string& name) {
+                     return &scope_.FindVar(name)->Get<T>();
+                   });
+    return res;
+  }
+
+  template <typename T>
+  std::vector<const T*> MultiOutput(const std::string& name) const {
+    auto names = op_.Outputs(name);
+    std::vector<const T*> res;
+    res.reserve(names.size());
+    std::transform(names.begin(), names.end(), std::back_inserter(res),
+                   [this](const std::string& name) {
+                     return scope_.FindVar(name)->GetMutable<T>();
+                   });
     return res;
   }
 
+  const OperatorBase& op_;
+  const Scope& scope_;
+};
+
+class InferShapeContext : public OperatorContext {
+ public:
+  InferShapeContext(const OperatorBase* op, const Scope& scope)
+      : OperatorContext(op, scope) {}
+};
+
+template <typename T>
+struct EigenDeviceConverter;
+
+template <>
+struct EigenDeviceConverter<platform::CPUPlace> {
+  using EigenDeviceType = Eigen::DefaultDevice;
+};
+
+#ifndef PADDLE_ONLY_CPU
+template <>
+struct EigenDeviceConverter<platform::GPUPlace> {
+  using EigenDeviceType = Eigen::GpuDevice;
+};
+#endif
+
+class ExecutionContext : public OperatorContext {
+ public:
+  ExecutionContext(const OperatorBase* op, const Scope& scope,
+                   const platform::DeviceContext& device_context)
+      : OperatorContext(op, scope), device_context_(device_context) {}
+
   template <typename PlaceType,
             typename DeviceType =
                 typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
@@ -159,38 +241,23 @@ class KernelContext {
 
   platform::Place GetPlace() const { return device_context_.GetPlace(); }
 
-  const OperatorBase& op_;
-  const std::shared_ptr<Scope>& scope_;
   const platform::DeviceContext& device_context_;
 };
 
 class OpKernel {
  public:
   /**
-   * KernelContext is the only parameter of Kernel Run function.
+   * ExecutionContext is the only parameter of Kernel Run function.
    * Run will get input/output variables, state such as momentum and
    * device resource such as CUDA stream, cublas handle, etc. from
-   * KernelContext. User should construct it before run the Operator.
+   * ExecutionContext. User should construct it before run the Operator.
    */
 
-  virtual void Compute(const KernelContext& context) const = 0;
+  virtual void Compute(const ExecutionContext& context) const = 0;
 
   virtual ~OpKernel() {}
 };
 
-template <typename T>
-struct VarToTensor {};
-
-template <>
-struct VarToTensor<Tensor*> {
-  Tensor* operator()(Variable* var) { return var->GetMutable<Tensor>(); }
-};
-
-template <>
-struct VarToTensor<const Tensor*> {
-  const Tensor* operator()(Variable* var) { return &var->Get<Tensor>(); }
-};
-
 class OperatorWithKernel : public OperatorBase {
  public:
   struct OpKernelKey {
@@ -216,10 +283,14 @@ class OperatorWithKernel : public OperatorBase {
   using OpKernelMap =
       std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;
 
-  void Run(const std::shared_ptr<Scope>& scope,
+  void InferShape(const Scope& scope) const {
+    InferShape(InferShapeContext(this, scope));
+  }
+
+  void Run(const Scope& scope,
            const platform::DeviceContext& dev_ctx) const final {
     auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
-    opKernel->Compute(KernelContext(this, scope, dev_ctx));
+    opKernel->Compute(ExecutionContext(this, scope, dev_ctx));
   }
 
   static std::unordered_map<std::string /* op_type */, OpKernelMap>&
@@ -228,34 +299,8 @@ class OperatorWithKernel : public OperatorBase {
     return g_all_op_kernels;
   }
 
-  void InferShape(const std::shared_ptr<Scope>& scope) const final {
-    std::vector<const Tensor*> ins;
-    VarNamesToTensors(scope, inputs_, &ins);
-    std::vector<Tensor*> outs;
-    VarNamesToTensors(scope, outputs_, &outs);
-    InferShape(ins, outs);
-  };
-
- private:
-  template <typename T>
-  void VarNamesToTensors(const std::shared_ptr<Scope>& scope,
-                         const std::vector<std::string>& var_names,
-                         std::vector<T>* container) const {
-    container->reserve(var_names.size());
-    VarToTensor<T> convert;
-    for (auto& name : var_names) {
-      auto var = scope->GetVariable(name);
-      if (var != nullptr) {
-        container->push_back(convert(var));
-      } else {
-        container->push_back(nullptr);
-      }
-    }
-  }
-
  protected:
-  virtual void InferShape(const std::vector<const Tensor*>& inputs,
-                          const std::vector<Tensor*>& outputs) const = 0;
+  virtual void InferShape(const InferShapeContext& ctx) const = 0;
 };
 
 }  // namespace framework
diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc
index 3fae356c3e5d5b44271440b66d6923fd4994b937..6a6a802b7da05c37a317540030836baa28a89cd7 100644
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -24,15 +24,15 @@ static int op_run_num = 0;
 class OpWithoutKernelTest : public OperatorBase {
  public:
   void Init() override { x = 1; }
-  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
-  void Run(const std::shared_ptr<Scope>& scope,
+  void InferShape(const Scope& scope) const override {}
+  void Run(const Scope& scope,
            const platform::DeviceContext& dev_ctx) const override {
     op_run_num++;
     ASSERT_EQ((int)inputs_.size(), 1);
     ASSERT_EQ((int)outputs_.size(), 1);
-    ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr);
+    ASSERT_EQ(scope.FindVar(inputs_[0]), nullptr);
     ASSERT_EQ(x, 1);
-    ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr);
+    ASSERT_NE(scope.FindVar(outputs_[0]), nullptr);
   }
 
  public:
@@ -68,11 +68,12 @@ TEST(OperatorBase, all) {
   attr->set_f(3.14);
 
   paddle::platform::CPUDeviceContext device_context;
-  auto scope = std::make_shared<paddle::framework::Scope>();
+  paddle::framework::Scope scope;
 
   auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
-  scope->CreateVariable("OUT1");
+  scope.NewVar("OUT1");
   ASSERT_EQ(paddle::framework::op_run_num, 0);
+  op->InferShape(scope);
   op->Run(scope, device_context);
   ASSERT_EQ(paddle::framework::op_run_num, 1);
 }
@@ -97,14 +98,13 @@ static int cpu_kernel_run_num = 0;
 
 class OpWithKernelTest : public OperatorWithKernel {
  protected:
-  void InferShape(const std::vector<const Tensor*>& inputs,
-                  const std::vector<Tensor*>& outputs) const override {}
+  void InferShape(const framework::InferShapeContext& ctx) const override {}
 };
 
 template <typename T1, typename T2>
 class CPUKernelTest : public OpKernel {
  public:
-  void Compute(const KernelContext& ctx) const {
+  void Compute(const ExecutionContext& ctx) const {
     std::cout << "this is cpu kernel" << std::endl;
     std::cout << ctx.op_.DebugString() << std::endl;
     cpu_kernel_run_num++;
@@ -117,12 +117,12 @@ class CPUKernelTest : public OpKernel {
 class OperatorMultiInputsTest : public OperatorBase {
  public:
   void Init() override { x = 1; }
-  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
-  void Run(const std::shared_ptr<Scope>& scope,
+  void InferShape(const Scope& scope) const override {}
+  void Run(const Scope& scope,
            const platform::DeviceContext& dev_ctx) const override {
-    ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr);
+    ASSERT_EQ(scope.FindVar(inputs_[0]), nullptr);
     ASSERT_EQ(x, 1);
-    ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr);
+    ASSERT_NE(scope.FindVar(outputs_[0]), nullptr);
     ASSERT_EQ(Input("x"), "IN1");
     ASSERT_EQ(Input("y"), "OUT1");
   }
@@ -137,9 +137,9 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker
   OpKernelTestMultiInputsProtoAndCheckerMaker(OpProto* proto,
                                               OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInputs("xs", "inputs of test op");
+    AddInput("xs", "inputs of test op").SetMultiple();
     AddInput("k", "input of test op");
-    AddOutputs("ys", "outputs of test op");
+    AddOutput("ys", "outputs of test op").SetMultiple();
     AddAttr<float>("scale", "scale of cosine op")
         .SetDefault(1.0)
         .LargerThan(0.0);
@@ -149,13 +149,31 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker
 
 class CPUKernalMultiInputsTest : public OpKernel {
  public:
-  void Compute(const KernelContext& ctx) const {
+  void Compute(const ExecutionContext& ctx) const {
     auto xs = ctx.op_.Inputs("xs");
     ASSERT_EQ(xs.size(), 3UL);
     ASSERT_EQ(xs[0], "x0");
     ASSERT_EQ(xs[1], "x1");
     ASSERT_EQ(xs[2], "x2");
 
+    auto inVar0 = ctx.MultiInputVar("xs");
+    ASSERT_EQ(inVar0.size(), 3UL);  // x0, x1, x2
+
+    auto inVar1 = ctx.InputVar("k");
+    ASSERT_NE(inVar1, nullptr);
+
+    auto outVar0 = ctx.MultiOutputVar("ys");
+    ASSERT_EQ(outVar0.size(), 2UL);  // y0, y1
+
+    auto inTensor0 = ctx.MultiInput<Tensor>("xs");
+    ASSERT_EQ(inTensor0.size(), 3UL);
+
+    auto inTensor1 = ctx.Input<Tensor>("k");
+    ASSERT_NE(inTensor1, nullptr);
+
+    auto outTensor0 = ctx.MultiOutput<Tensor>("ys");
+    ASSERT_EQ(outTensor0.size(), 2UL);
+
     auto k = ctx.op_.Input("k");
     ASSERT_EQ(k, "k0");
 
@@ -186,7 +204,7 @@ TEST(OpKernel, all) {
   attr->set_f(3.14);
 
   paddle::platform::CPUDeviceContext cpu_device_context;
-  auto scope = std::make_shared<paddle::framework::Scope>();
+  paddle::framework::Scope scope;
 
   auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
   ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
@@ -232,7 +250,13 @@ TEST(OpKernel, multi_inputs) {
   output_format->Add(2);  // y1
 
   paddle::platform::CPUDeviceContext cpu_device_context;
-  auto scope = std::make_shared<Scope>();
+  paddle::framework::Scope scope;
+  scope.NewVar("x0")->GetMutable<Tensor>();
+  scope.NewVar("x1")->GetMutable<Tensor>();
+  scope.NewVar("x2")->GetMutable<Tensor>();
+  scope.NewVar("k0")->GetMutable<Tensor>();
+  scope.NewVar("y0")->GetMutable<Tensor>();
+  scope.NewVar("y1")->GetMutable<Tensor>();
 
   auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
   op->Run(scope, cpu_device_context);
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
new file mode 100644
index 0000000000000000000000000000000000000000..080b4ac621c1b8c0d4b4e7b26f394cf2be263894
--- /dev/null
+++ b/paddle/framework/scope.cc
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/scope.h"
+#include "paddle/string/printf.h"
+
+namespace paddle {
+namespace framework {
+
+Scope::~Scope() {
+  DropKids();
+  for (auto& kv : vars_) delete kv.second;
+}
+
+Scope& Scope::NewScope() const {
+  kids_.push_back(new Scope(this));
+  return *kids_.back();
+}
+
+// Get-or-create: looks only at this scope's own table, never at ancestors.
+Variable* Scope::NewVar(const std::string& name) {
+  auto slot = vars_.find(name);
+  if (slot != vars_.end()) return slot->second;
+  // Insert once and reuse the returned iterator; name_ points at the key
+  // stored inside vars_, so it stays valid as long as the entry exists.
+  slot = vars_.emplace(name, new Variable()).first;
+  slot->second->name_ = &slot->first;
+  return slot->second;
+}
+
+Variable* Scope::NewVar() {
+  return NewVar(string::Sprintf("%p.%d", this, vars_.size()));
+}
+
+Variable* Scope::FindVar(const std::string& name) const {
+  const auto found = vars_.find(name);  // a local entry shadows ancestors
+  if (found != vars_.end()) return found->second;
+  return parent_ ? parent_->FindVar(name) : nullptr;
+}
+
+// Returns the nearest scope (this one or an ancestor) that owns `var`, or
+// nullptr when no scope on the parent chain holds it.
+const Scope* Scope::FindScope(const Variable* var) const {
+  for (const auto& entry : vars_) {
+    if (entry.second == var) return this;
+  }
+  return parent_ ? parent_->FindScope(var) : nullptr;
+}
+void Scope::DropKids() {
+  for (Scope* s : kids_) delete s;
+  kids_.clear();
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index 4faaf841440ba30b79c83d09fea977186bd0270a..2ba3f8ed355b48800cfa4180e4e8a94f2c9958a9 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <list>
 #include <string>
 #include <unordered_map>
-#include <vector>
 
 #include "paddle/framework/variable.h"
 
@@ -35,73 +35,42 @@ class Scope;
  */
 class Scope {
  public:
-  /**
-   * @brief Initialize s Scope without parent.
-   */
   Scope() {}
+  ~Scope();
 
-  /**
-   * @brief Initialize a Scope with parent.
-   */
-  explicit Scope(const std::shared_ptr<Scope>& parent) : parent_(parent) {}
-
-  /**
-   * @brief Create Variable
-   *
-   * Create Variable in this Scope. Return the exist one if Variable already
-   * been created.
-   */
-  Variable* CreateVariable(const std::string& name) {
-    auto var = GetVariable(name);
-    if (var) {
-      return var;
-    } else {
-      auto ptr = new Variable();
-      name_to_var_[name] = std::unique_ptr<Variable>(ptr);
-      var_to_name_[ptr] = name;
-      return GetVariable(name);
-    }
-  }
-
-  /**
-   * @brief Get Variable.
-   *
-   * Get Variable from this Scope, this function will recursive find Variable
-   * from it's parent scope. Return nullptr if not found.
-   */
-  Variable* GetVariable(const std::string& name) const {
-    auto it = name_to_var_.find(name);
-    if (it != name_to_var_.end()) {
-      return it->second.get();
-    } else if (parent_ != nullptr) {
-      return parent_->GetVariable(name);
-    } else {
-      return nullptr;
-    }
-  }
-
-  /**
-   * @brief If this scope has a Var named name.
-   *
-   * Find if there is a Variable in this scope and it's parent scope
-   */
-  bool HasVariable(const std::string& name) const {
-    return (name_to_var_.find(name) != name_to_var_.end() ||
-            (parent_ && parent_->HasVariable(name)));
-  }
-
-  std::string GetVariableName(Variable* const var) const {
-    try {
-      return var_to_name_.at(var);
-    } catch (...) {
-      return "";
-    }
-  }
+  // Disable Copy, Assign, Move.
+  Scope(const Scope& other) = delete;
+  Scope& operator=(const Scope& other) = delete;
+  Scope(Scope&& other) = delete;
+
+  /// Create a sub-scope. Returns a reference rather than a pointer to
+  /// discourage callers from deleting it manually.
+  /// Marked const because a new kid scope cannot change the parent scope.
+  Scope& NewScope() const;
+
+  /// Create a variable with given name if it doesn't exist.
+  Variable* NewVar(const std::string& name);
+
+  /// Create a variable with a scope-unique name.
+  Variable* NewVar();
+
+  /// Find a variable in the scope or any of its ancestors.  Returns
+  /// nullptr if cannot find.
+  Variable* FindVar(const std::string& name) const;
+
+  /// Find the scope or an ancestor scope that contains the given variable.
+  const Scope* FindScope(const Variable* var) const;
+
+  /// Drop all kid scopes belonging to this scope.
+  void DropKids();
 
  private:
-  std::unordered_map<Variable*, std::string> var_to_name_;
-  std::unordered_map<std::string, std::unique_ptr<Variable>> name_to_var_;
-  std::shared_ptr<Scope> parent_{nullptr};
+  // Call Scope::NewScope for a sub-scope.
+  explicit Scope(Scope const* parent) : parent_(parent) {}
+
+  std::unordered_map<std::string, Variable*> vars_;
+  mutable std::list<Scope*> kids_;
+  Scope const* parent_{nullptr};
 };
 
 }  // namespace framework
diff --git a/paddle/framework/scope_test.cc b/paddle/framework/scope_test.cc
index ff069c7be002e9bcfd63225c3d80aa958935ba14..9d51e355b0f6336d2f875ff2d77266b261baf5ac 100644
--- a/paddle/framework/scope_test.cc
+++ b/paddle/framework/scope_test.cc
@@ -15,49 +15,42 @@ limitations under the License. */
 #include "paddle/framework/scope.h"
 #include "gtest/gtest.h"
 
-TEST(Scope, Create) {
-  using paddle::framework::Scope;
-  using paddle::framework::Variable;
+using paddle::framework::Scope;
+using paddle::framework::Variable;
 
-  auto scope = std::make_shared<Scope>();
+TEST(Scope, VarsShadowing) {
+  Scope s;
+  Scope& ss1 = s.NewScope();
+  Scope& ss2 = s.NewScope();
 
-  Variable* var0 = scope->CreateVariable("");
-  EXPECT_NE(var0, nullptr);
+  Variable* v0 = s.NewVar("a");
+  Variable* v1 = ss1.NewVar("a");
 
-  /// GetVariable will return nullptr if not exist.
-  Variable* var1 = scope->GetVariable("a");
-  EXPECT_EQ(var1, nullptr);
+  EXPECT_NE(v0, v1);
 
-  /// CreateVariable will return one.
-  Variable* var2 = scope->CreateVariable("a");
-  EXPECT_NE(var2, nullptr);
-
-  /// Get the created variable.
-  Variable* var3 = scope->GetVariable("a");
-  EXPECT_EQ(var2, var3);
+  EXPECT_EQ(v0, s.FindVar("a"));
+  EXPECT_EQ(v1, ss1.FindVar("a"));
+  EXPECT_EQ(v0, ss2.FindVar("a"));
+}
 
-  /// CreateVariable will just return the variable if it's
-  /// already exist.
-  Variable* var4 = scope->CreateVariable("a");
-  EXPECT_EQ(var4, var2);
+TEST(Scope, FindVar) {
+  Scope s;
+  Scope& ss = s.NewScope();
 
-  EXPECT_EQ("a", scope->GetVariableName(var4));
-  Scope scope2;
-  auto var = scope2.CreateVariable("tmp");
-  EXPECT_EQ("", scope->GetVariableName(var));
-}
+  EXPECT_EQ(nullptr, s.FindVar("a"));
+  EXPECT_EQ(nullptr, ss.FindVar("a"));
 
-TEST(Scope, Parent) {
-  using paddle::framework::Scope;
-  using paddle::framework::Variable;
+  ss.NewVar("a");
 
-  auto parent_scope = std::make_shared<Scope>();
-  auto scope = std::make_shared<Scope>(parent_scope);
+  EXPECT_EQ(nullptr, s.FindVar("a"));
+  EXPECT_NE(nullptr, ss.FindVar("a"));
+}
 
-  Variable* var0 = parent_scope->CreateVariable("a");
-  EXPECT_NE(var0, nullptr);
+TEST(Scope, FindScope) {
+  Scope s;
+  Scope& ss = s.NewScope();
+  Variable* v = s.NewVar("a");
 
-  /// GetVariable will get Variable from parent scope if exist.
-  Variable* var1 = scope->GetVariable("a");
-  EXPECT_EQ(var0, var1);
+  EXPECT_EQ(&s, s.FindScope(v));
+  EXPECT_EQ(&s, ss.FindScope(v));
 }
diff --git a/paddle/framework/variable.h b/paddle/framework/variable.h
index 72c4a7a2a1d1cf93a784f24e687727ee8481484c..38fc2720a3023039aa113b32a394bda9c5def4c0 100644
--- a/paddle/framework/variable.h
+++ b/paddle/framework/variable.h
@@ -16,7 +16,7 @@
 #include <typeindex>
 #include <typeinfo>
 
-#include "paddle/platform/assert.h"
+#include "paddle/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
@@ -25,7 +25,7 @@ class Variable {
  public:
   template <typename T>
   const T& Get() const {
-    PADDLE_ASSERT(IsType<T>());
+    PADDLE_ENFORCE(IsType<T>(), "Variable must be type %s", typeid(T).name());
     return *static_cast<const T*>(holder_->Ptr());
   }
 
@@ -65,6 +65,17 @@ class Variable {
 
   std::unique_ptr<Placeholder>
       holder_;  // pointers to a PlaceholderImpl object indeed.
+
+  // name_ is only meaningful with a Scope and accessible by it.
+  //
+  // NOTE: Please don't expose name_ by adding methods like
+  // Variable::Name or Scope::VarName!  A variable could have a human
+  // readable name or an auto-generated scope-unique name.  In the
+  // former case, the caller knows the name and doesn't need to access
+  // the name; in the latter case, the variable should be identified
+  // by its address but not the unreadable name.
+  friend class Scope;
+  const std::string* name_{nullptr};  // assigned by Scope::NewVar
 };
 
 }  // namespace framework
diff --git a/paddle/gserver/layers/SliceProjection.cpp b/paddle/gserver/layers/SliceProjection.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..267dd6154b1b21cc9b936384d438a2c3bdf0c246
--- /dev/null
+++ b/paddle/gserver/layers/SliceProjection.cpp
@@ -0,0 +1,96 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Projection.h"
+
+namespace paddle {
+
+/**
+ * SliceProjection can slice the input value into multiple parts,
+ * and then select some of them to merge into a new output.
+ *
+ * First, calculate the slices that need to be merged into the output.
+ * slices = input.slices().for_output()
+ *
+ * Second, merge each slice into the output.
+ * for(auto slice: slices) {
+ *   out.addAtOffset(slice, offset);
+ * }
+ *
+ * Input slices as output: s0, s1, ...:
+ *   -----------------------
+ *   |///|   |//////|      |
+ *   |/s0|   |//s1//|      |
+ *   |///|   |//////|      |
+ *   -----------------------
+ * Output, merge s0, s1, ... into one output:
+ *   ----------------
+ *   |///|//////|   |
+ *   |/s0|//s1//|...|
+ *   |///|//////|   |
+ *   ----------------
+ *
+ * The config file api is slice_projection.
+ */
+class SliceProjection : public Projection {
+public:
+  SliceProjection(const ProjectionConfig& config,
+                  const ParameterPtr& parameter,
+                  bool useGpu);
+  void forward() override;
+  void backward(const UpdateCallback& callback) override;
+
+protected:
+  std::vector<std::pair<size_t, size_t>> slices_;  // (start, end) per slice
+};
+
+REGISTER_PROJECTION(slice, SliceProjection);
+
+/**
+ * Constructor.
+ * @note SliceProjection should not have any parameter.
+ */
+SliceProjection::SliceProjection(const ProjectionConfig& config,
+                                 const ParameterPtr& parameter,
+                                 bool useGpu)
+    : Projection(config, parameter, useGpu) {
+  CHECK(!parameter) << "'slice' projection should not have any parameter";
+  // Snapshot the configured (start, end) pairs for forward()/backward().
+  slices_.reserve(config.slices_size());
+  for (const auto& cfg : config.slices()) {
+    slices_.emplace_back(cfg.start(), cfg.end());
+  }
+}
+
+void SliceProjection::forward() {
+  size_t outOffset = 0;  // running offset into the output
+  for (const auto& range : slices_) {
+    auto part = in_->value->subColMatrix(range.first, range.second);
+    out_->value->addAtOffset(*part, outOffset);
+    outOffset += part->getWidth();
+  }
+}
+
+void SliceProjection::backward(const UpdateCallback& callback) {
+  if (!in_->grad) return;  // nothing to do without an input gradient
+  // Same slices and running offset as forward(), applied to the gradients.
+  size_t outOffset = 0;
+  for (const auto& range : slices_) {
+    auto gradPart = in_->grad->subColMatrix(range.first, range.second);
+    gradPart->addAtOffset(*out_->grad, outOffset);
+    outOffset += gradPart->getWidth();
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/tests/concat_slice_a.conf b/paddle/gserver/tests/concat_slice_a.conf
new file mode 100644
index 0000000000000000000000000000000000000000..dccf911089e16f4f97b1470ee39d192d4557d4bd
--- /dev/null
+++ b/paddle/gserver/tests/concat_slice_a.conf
@@ -0,0 +1,41 @@
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=10)
+
+data = data_layer(name ="input", size=8*16*16)
+
+conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                       num_channels=8,
+                       num_filters=16, stride=1,
+                       bias_attr=False,
+                       act=ReluActivation())
+conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                       num_channels=8,
+                       num_filters=16, stride=1,
+                       bias_attr=False,
+                       act=ReluActivation())
+
+proj1 = slice_projection(input=conv1, slices=[(0, 4), (4, 12)])
+
+proj2 = slice_projection(input=conv2, slices=[(1, 5), (5, 15)])
+
+concat = concat_layer(input=[proj1, proj2])
+
+outputs(concat)
+
diff --git a/paddle/gserver/tests/concat_slice_b.conf b/paddle/gserver/tests/concat_slice_b.conf
new file mode 100644
index 0000000000000000000000000000000000000000..29686ef2810370af3f84b60b2450d5c7d2e7663d
--- /dev/null
+++ b/paddle/gserver/tests/concat_slice_b.conf
@@ -0,0 +1,41 @@
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=10)
+
+data = data_layer(name ="input", size=8*16*16)
+
+conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                       num_channels=8,
+                       num_filters=16, stride=1,
+                       bias_attr=False,
+                       act=ReluActivation())
+conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                       num_channels=8,
+                       num_filters=16, stride=1,
+                       bias_attr=False,
+                       act=ReluActivation())
+
+proj1 = slice_projection(input=conv1, slices=[(0, 12)])
+
+proj2 = slice_projection(input=conv2, slices=[(1, 15)])
+
+concat = concat_layer(input=[proj1, proj2])
+
+outputs(concat)
+
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 0975c3bc9573c6ccb8f0ac98c41586d322d2465e..8ce8600c6743779899b2685c1c12053922265411 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -152,6 +152,26 @@ TEST(Projection, identity) {
   }
 }
 
+TEST(Projection, slice) {
+  ProjectionConfig conf;
+  conf.set_type("slice");
+  conf.set_input_size(100);
+  SliceConfig& slice1 = *conf.add_slices();
+  slice1.set_start(10);
+  slice1.set_end(20);
+  SliceConfig& slice2 = *conf.add_slices();
+  slice2.set_start(50);
+  slice2.set_end(70);
+  conf.set_output_size(30);
+  for (auto useGpu : {false, true}) {
+    testProjectionGrad(conf,
+                       INPUT_DATA,
+                       /* parameterSize */ 0,
+                       /* batchSize */ 10,
+                       useGpu);
+  }
+}
+
 TEST(Projection, scaling) {
   ProjectionConfig conf;
   conf.set_type("scaling");
diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp
index 40e662b22bac0a2d22aea31fe99b11695bac3f57..f930c72fde3f5e0a6a45cb6bfd3507a4f48028fc 100644
--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
@@ -237,6 +237,12 @@ TEST(Compare, concat_table) {
   compareNetwork(config_file_a, config_file_b);
 }
 
+TEST(Compare, concat_slice) {
+  std::string config_file_a = "./gserver/tests/concat_slice_a.conf";
+  std::string config_file_b = "./gserver/tests/concat_slice_b.conf";
+  compareNetwork(config_file_a, config_file_b);
+}
+
 #ifndef PADDLE_ONLY_CPU
 TEST(Compare, img_pool) {
   std::string config_file_a = "./gserver/tests/img_pool_a.conf";
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index c62c852b13dd2ea2c3d90053c0a6cd9be2687309..de318c7deca0f380aa69a6f8138edc0d85e05c2f 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -53,6 +53,7 @@ op_library(rowwise_add_op SRCS rowwise_add_op.cu rowwise_add_op.cc)
 op_library(sigmoid_op SRCS sigmoid_op.cc sigmoid_op.cu)
 op_library(softmax_op SRCS softmax_op.cc softmax_op.cu)
 op_library(cross_entropy_op SRCS cross_entropy_op.cc cross_entropy_op.cu)
+op_library(fill_zeros_like_op SRCS fill_zeros_like_op.cc fill_zeros_like_op.cu)
 op_library(sgd_op SRCS sgd_op.cc sgd_op.cu)
 
 op_library(fc_op
diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc
index 1424b0284372d8dfe9eb93ee251b121a48b19b0b..3a43dbfbada87e458109d8ca22effdb4407b4c1d 100644
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
@@ -19,16 +19,16 @@ namespace operators {
 
 class AddOp : public OperatorWithKernel {
 protected:
-  void InferShape(const std::vector<const Tensor *> &inputs,
-                  const std::vector<Tensor *> &outputs) const override {
-    PADDLE_ENFORCE(inputs.size() == 2, "Input size of AddOp must be two");
-    PADDLE_ENFORCE(outputs.size() == 1, "Output size of AddOp must be one");
-    PADDLE_ENFORCE(
-        inputs[0] != nullptr && inputs[1] != nullptr && outputs[0] != nullptr,
-        "Inputs/Outputs of AddOp must all be set");
-    PADDLE_ENFORCE(inputs[0]->dims() == inputs[1]->dims(),
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 2, "Input size of AddOp must be two");
+    PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of AddOp must be one");
+    PADDLE_ENFORCE(ctx.InputVar(0) != nullptr && ctx.InputVar(1) != nullptr,
+                   "Inputs of AddOp must all be set");
+    PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr,
+                   "Outputs of AddOp must all be set");
+    PADDLE_ENFORCE(ctx.Input<Tensor>(0)->dims() == ctx.Input<Tensor>(1)->dims(),
                    "Two input of Add Op's dimension must be same.");
-    outputs[0]->Resize(inputs[0]->dims());
+    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
   }
 };
 
@@ -49,8 +49,7 @@ The equation is: Out = X + Y
 
 class AddOpGrad : public OperatorWithKernel {
 protected:
-  void InferShape(const std::vector<const Tensor *> &inputs,
-                  const std::vector<Tensor *> &outputs) const override {}
+  void InferShape(const InferShapeContext &ctx) const override {}
   std::string DebugString() const override {
     LOG(INFO) << "AddOpGrad";
     return "";
diff --git a/paddle/operators/add_op.h b/paddle/operators/add_op.h
index 0c39433788e1e07e30aaadc4766028219b05bfa5..d2b649fcbd1e5cac1c8cfcfd4e522e41135f7d1f 100644
--- a/paddle/operators/add_op.h
+++ b/paddle/operators/add_op.h
@@ -21,16 +21,17 @@ namespace operators {
 template <typename Place, typename T>
 class AddKernel : public OpKernel {
 public:
-  void Compute(const KernelContext& context) const override {
-    auto input0 = context.Input(0)->Get<Tensor>();
-    auto input1 = context.Input(1)->Get<Tensor>();
-    auto output = context.Output(0)->GetMutable<Tensor>();
+  void Compute(const ExecutionContext& context) const override {
+    auto input0 = context.Input<Tensor>(0);
+    auto input1 = context.Input<Tensor>(1);
+    auto output = context.Output<Tensor>(0);
 
     output->mutable_data<T>(context.GetPlace());
 
     EigenVector<T>::Flatten(*output).device(
         *(context.GetEigenDevice<Place>())) =
-        EigenVector<T>::Flatten(input0) + EigenVector<T>::Flatten(input1);
+        framework::EigenVector<T>::Flatten(*input0) +
+        framework::EigenVector<T>::Flatten(*input1);
   }
 };
 
diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc
index 46c88d4d1a28eeedd02eb699562244651ead6d68..4f5b935fde4d5b0d9efae66554cf890291e26941 100644
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -19,20 +19,20 @@ namespace operators {
 
 class OnehotCrossEntropyOp : public OperatorWithKernel {
 protected:
-  void InferShape(const std::vector<const Tensor *> &inputs,
-                  const std::vector<Tensor *> &outputs) const override {
-    PADDLE_ENFORCE(inputs.size() == 2,
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 2,
                    "Input size of OnehotCrossEntropyOp must be two");
-    PADDLE_ENFORCE(outputs.size() == 1,
+    PADDLE_ENFORCE(ctx.OutputSize() == 1,
                    "Output size of OnehotCrossEntropyOp must be one");
-    PADDLE_ENFORCE(inputs[0] != nullptr && inputs[1] != nullptr,
+    PADDLE_ENFORCE(ctx.InputVar(0) != nullptr && ctx.InputVar(1) != nullptr,
                    "Inputs of OnehotCrossEntropyOp must all be set");
-    PADDLE_ENFORCE(outputs[0] != nullptr,
+    PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr,
                    "Outputs of OnehotCrossEntropyOp must all be set");
-    PADDLE_ENFORCE(inputs[0]->dims().size() == 2, "X's dimension must be 2.");
-    PADDLE_ENFORCE(outputs[0]->dims().size() == 1,
+    PADDLE_ENFORCE(ctx.Input<Tensor>(0)->dims().size() == 2,
+                   "X's dimension must be 2.");
+    PADDLE_ENFORCE(ctx.Input<Tensor>(1)->dims().size() == 1,  // label
                    "label's dimension must be 1.");
-    outputs[0]->Resize({inputs[0]->dims()[0]});
+    ctx.Output<Tensor>(0)->Resize({ctx.Input<Tensor>(0)->dims()[0]});
   }
 };
 
diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h
index 0383df46be3a3cea7dde8f1b45857e64d5a2f2d8..c3a3728149950a5c7f2195122e8e0ff728492bdb 100644
--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
@@ -23,18 +23,18 @@ class OnehotCrossEntropyOpKernel : public OpKernel {
 public:
   constexpr T LOG_THRESHOLD() const { return static_cast<T>(1e-20); }
 
-  void Compute(const KernelContext& context) const override {
-    auto X = context.Input(0)->Get<Tensor>();
-    const T* X_data = X.data<T>();
-    const int* label_data = context.Input(1)->Get<Tensor>().data<int>();
-    auto* Y = context.Output(0)->GetMutable<Tensor>();
+  void Compute(const ExecutionContext& ctx) const override {
+    auto X = ctx.Input<Tensor>(0);
+    const T* X_data = X->data<T>();
+    const int* label_data = ctx.Input<Tensor>(1)->data<int>();
+    auto Y = ctx.Output<Tensor>(0);
 
-    Y->mutable_data<T>(context.GetPlace());
+    Y->mutable_data<T>(ctx.GetPlace());
 
     T* Y_data = Y->data<T>();
 
-    int batch_size = X.dims()[0];
-    int class_num = X.dims()[1];
+    int batch_size = X->dims()[0];
+    int class_num = X->dims()[1];
 
     // Y[i] = -log(X[i][j])
     for (int i = 0; i < batch_size; ++i) {
diff --git a/paddle/operators/fc_op.cc b/paddle/operators/fc_op.cc
index c4a9f5937f4fa8c60989bea1726cedbb73330156..71ceda958770796693265c08cb1fcae27e79bcd9 100644
--- a/paddle/operators/fc_op.cc
+++ b/paddle/operators/fc_op.cc
@@ -50,8 +50,8 @@ public:
     AddInput("b", "the bias of fc operator");
 
     AddOutput("Y", "the output of fc operator");
-    AddOutput(
-        "before_act", "the before activation output of fc operator", true);
+    AddOutput("before_act", "the before activation output of fc operator")
+        .SetTemporary();
     AddAttr<std::string>("activation", "The activation key for fc layer")
         .SetDefault("sigmoid")
         .InEnum({"sigmoid", "softmax"});
diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..79a0e3d7e911b728a7a96ceff573976ba2b2e37f
--- /dev/null
+++ b/paddle/operators/fill_zeros_like_op.cc
@@ -0,0 +1,60 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/fill_zeros_like_op.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+
+class FillZerosLikeOp : public framework::OperatorWithKernel {
+protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 1UL,
+                   "Input size of FillZerosLikeOp must be one.");
+    PADDLE_ENFORCE(ctx.OutputSize() == 1UL,
+                   "Output size of FillZerosLikeOp must be one.");
+    PADDLE_ENFORCE(ctx.InputVar(0) != nullptr,
+                   "Input of FillZerosLikeOp must be set.");
+    PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr,
+                   "Output of FillZerosLikeOp must be set.");
+    ctx.Output<framework::Tensor>(0)->Resize(
+        ctx.Input<framework::Tensor>(0)->dims());  // Dst takes Src's shape
+  }
+};
+
+class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  FillZerosLikeOpMaker(framework::OpProto *proto,
+                       framework::OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Src", "The input of fill-zeros-like op.");
+    AddOutput("Dst", "The variable will be filled up with zeros.");
+    AddComment(R"DOC(
+Fill up a variable with zeros.
+
+The output will have the same size as the input.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(fill_zeros_like,
+            paddle::operators::FillZerosLikeOp,
+            paddle::operators::FillZerosLikeOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    fill_zeros_like,
+    paddle::operators::FillZerosLikeKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/fill_zeros_like_op.cu b/paddle/operators/fill_zeros_like_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..55ad58f4f17cd4a3e737c01b001675d2690d273e
--- /dev/null
+++ b/paddle/operators/fill_zeros_like_op.cu
@@ -0,0 +1,6 @@
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/fill_zeros_like_op.h"
+
+REGISTER_OP_GPU_KERNEL(
+    fill_zeros_like,
+    paddle::operators::FillZerosLikeKernel<paddle::platform::GPUPlace, float>);
\ No newline at end of file
diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..05272964abd43bdc2bd5c3cae8b128099e1c888c
--- /dev/null
+++ b/paddle/operators/fill_zeros_like_op.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "glog/logging.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class FillZerosLikeKernel : public framework::OpKernel {
+public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* output = context.Output<framework::Tensor>(0);
+    output->mutable_data<T>(context.GetPlace());
+    framework::EigenVector<T>::Flatten(*output).setZero();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc
index 22c1b78005358a934c57d487f5b0cff133f61f0c..d127f3a302a340fe7558f918d6eeb2ea0a3fafe7 100644
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -19,18 +19,17 @@ namespace operators {
 
 class MulOp : public OperatorWithKernel {
 protected:
-  void InferShape(const std::vector<const Tensor *> &inputs,
-                  const std::vector<Tensor *> &outputs) const override {
-    PADDLE_ENFORCE(inputs.size() == 2, "The mul op must take two inputs");
-    auto dim0 = inputs[0]->dims();
-    auto dim1 = inputs[1]->dims();
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 2, "The mul op must take two inputs");
+    auto dim0 = ctx.Input<Tensor>(0)->dims();
+    auto dim1 = ctx.Input<Tensor>(1)->dims();
     PADDLE_ENFORCE(dim0.size() == 2 && dim1.size() == 2,
                    "The input of mul op must be matrix");
     PADDLE_ENFORCE(
         dim0[1] == dim1[0],
         "First matrix's width must be equal with second matrix's height.");
-    PADDLE_ENFORCE(outputs.size() == 1, "The mul op must take one output");
-    outputs[0]->Resize({dim0[0], dim1[1]});
+    PADDLE_ENFORCE(ctx.OutputSize() == 1, "The mul op must take one output");
+    ctx.Output<Tensor>(0)->Resize({dim0[0], dim1[1]});
   }
 };
 
@@ -51,8 +50,7 @@ The equation is: Out = X * Y
 
 class MulOpGrad : public OperatorWithKernel {
 protected:
-  void InferShape(const std::vector<const Tensor *> &inputs,
-                  const std::vector<Tensor *> &outputs) const override {}
+  void InferShape(const InferShapeContext &ctx) const override {}
   std::string DebugString() const override {
     LOG(INFO) << "MulGrad";
     return "";
diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h
index 467975044638a3f034ceec84173e8d3fed43cc0c..eef72ab293e13a9d05ce0013be41ec4bb75d6077 100644
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@@ -22,19 +22,17 @@ namespace operators {
 template <typename Place, typename T>
 class MulKernel : public OpKernel {
 public:
-  void Compute(const KernelContext& context) const override {
+  void Compute(const ExecutionContext& context) const override {
     Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair = {
         {Eigen::IndexPair<Eigen::DenseIndex>(1, 0)}};
 
-    auto input0 = context.Input(0)->Get<Tensor>();
-    auto input1 = context.Input(1)->Get<Tensor>();
-    auto* output = context.Output(0)->GetMutable<Tensor>();
-
+    auto output = context.Output<Tensor>(0);
     output->mutable_data<T>(context.GetPlace());
 
     EigenMatrix<T>::From(*output).device(*(context.GetEigenDevice<Place>())) =
-        EigenMatrix<T>::From(input0).contract(EigenMatrix<T>::From(input1),
-                                              dim_pair);
+        EigenMatrix<T>::From(*context.Input<Tensor>("X"))
+            .contract(EigenMatrix<T>::From(*context.Input<Tensor>("Y")),
+                      dim_pair);
   }
 };
 }  // namespace operators
diff --git a/paddle/operators/recurrent_network_op.cc b/paddle/operators/recurrent_network_op.cc
index 1a101d6ddf149d608dbdbe048ef43d86bacbcc16..60d065fc4789f76370840328870165579aa73b67 100644
--- a/paddle/operators/recurrent_network_op.cc
+++ b/paddle/operators/recurrent_network_op.cc
@@ -27,38 +27,37 @@ namespace operators {
 
 namespace rnn {
 
-void SegmentInputs(std::vector<std::shared_ptr<Scope>>& step_scopes,
+void SegmentInputs(const std::vector<Scope*>& step_scopes,
                    const std::vector<Link>& inlinks,
                    const size_t seq_len) {
   PADDLE_ENFORCE(!inlinks.empty(), "no in links are provided.");
   for (size_t i = 0; i < inlinks.size(); ++i) {
     Tensor* input =
-        step_scopes[0]->GetVariable(inlinks[i].external)->GetMutable<Tensor>();
+        step_scopes[0]->FindVar(inlinks[i].external)->GetMutable<Tensor>();
     DDim dims = input->dims();
     PADDLE_ENFORCE(static_cast<size_t>(dims[0]) == seq_len,
                    "all the inlinks must have same length");
     DDim step_dims = slice_ddim(dims, 1, dims.size());
     for (size_t j = 0; j < seq_len; j++) {
-      Tensor* step_input = step_scopes[j]
-                               ->CreateVariable(inlinks[i].internal)
-                               ->GetMutable<Tensor>();
+      Tensor* step_input =
+          step_scopes[j]->NewVar(inlinks[i].internal)->GetMutable<Tensor>();
       *step_input = input->Slice<float>(j, j + 1);
       step_input->Resize(step_dims);
     }
   }
 }
 
-void ConcatOutputs(std::vector<std::shared_ptr<Scope>>& step_scopes,
+void ConcatOutputs(const std::vector<Scope*>& step_scopes,
                    const std::vector<Link>& outlinks,
                    const size_t seq_len) {
   for (size_t i = 0; i < outlinks.size(); i++) {
     Tensor* output =
-        step_scopes[0]->GetVariable(outlinks[i].external)->GetMutable<Tensor>();
+        step_scopes[0]->FindVar(outlinks[i].external)->GetMutable<Tensor>();
 
     // TODO(qingiqng) remove following code after adding
     // InferShape in RecurrentGradientOp
     DDim step_dims = step_scopes[0]
-                         ->GetVariable(outlinks[i].internal)
+                         ->FindVar(outlinks[i].internal)
                          ->GetMutable<Tensor>()
                          ->dims();
     std::vector<int> dims_vec = vectorize(step_dims);
@@ -66,9 +65,8 @@ void ConcatOutputs(std::vector<std::shared_ptr<Scope>>& step_scopes,
     output->mutable_data<float>(make_ddim(dims_vec), platform::CPUPlace());
 
     for (size_t j = 0; j < seq_len; j++) {
-      Tensor* step_output = step_scopes[j]
-                                ->GetVariable(outlinks[i].internal)
-                                ->GetMutable<Tensor>();
+      Tensor* step_output =
+          step_scopes[j]->FindVar(outlinks[i].internal)->GetMutable<Tensor>();
       // TODO(luotao02) data type and platform::DeviceContext() should set
       // correctly
       (output->Slice<float>(j, j + 1))
@@ -77,7 +75,7 @@ void ConcatOutputs(std::vector<std::shared_ptr<Scope>>& step_scopes,
   }
 }
 
-void LinkMemories(std::vector<std::shared_ptr<Scope>>& scopes,
+void LinkMemories(const std::vector<Scope*>& scopes,
                   const std::vector<rnn::MemoryAttr>& memories,
                   size_t step_id,
                   int offset) {
@@ -94,17 +92,17 @@ void LinkMemories(std::vector<std::shared_ptr<Scope>>& scopes,
                  offset,
                  scopes.size(),
                  step_id);
-  std::shared_ptr<Scope> scope = scopes[step_id];
-  std::shared_ptr<Scope> linked_scope = scopes[step_id + offset];
+  auto scope = scopes[step_id];
+  auto linked_scope = scopes[step_id + offset];
   for (auto& attr : memories) {
-    auto mem = scope->CreateVariable(attr.pre_var)->GetMutable<Tensor>();
+    auto mem = scope->NewVar(attr.pre_var)->GetMutable<Tensor>();
     // maybe share variable is better?
-    auto linked_mem = linked_scope->GetVariable(attr.var)->GetMutable<Tensor>();
+    auto linked_mem = linked_scope->FindVar(attr.var)->GetMutable<Tensor>();
     mem->ShareDataWith<float>(*linked_mem);
 
     // TODO(qingqing) remove following code
     // the memory of current step should be allocated in step net
-    auto m = scope->CreateVariable(attr.var)->GetMutable<Tensor>();
+    auto m = scope->NewVar(attr.var)->GetMutable<Tensor>();
     // for unit test, as addOp and mulOp are null currently, if not
     // mutable_data, mem.data() in output will be error. We will
     // remove this line after merge the correct addOp and mulOp.
@@ -171,8 +169,8 @@ void InitArgument(const ArgumentName& name,
 
 }  // namespace rnn
 
-void RecurrentAlgorithm::InferShape(const std::shared_ptr<Scope>& scope) const {
-  seq_len_ = scope->GetVariable((arg_->inlinks[0]).external)
+void RecurrentAlgorithm::InferShape(const Scope& scope) const {
+  seq_len_ = scope.FindVar((arg_->inlinks[0]).external)
                  ->GetMutable<Tensor>()
                  ->dims()[0];
   CreateScopes(scope);
@@ -187,10 +185,10 @@ void RecurrentAlgorithm::InferShape(const std::shared_ptr<Scope>& scope) const {
 
   InitMemories(step_scopes[0]);
 
-  PADDLE_ENFORCE(scope->HasVariable(arg_->step_net),
+  PADDLE_ENFORCE(scope.FindVar(arg_->step_net) != nullptr,
                  "stepnet [%s] is not in scope.",
                  arg_->step_net);
-  Variable* net = scope->GetVariable(arg_->step_net);
+  Variable* net = scope.FindVar(arg_->step_net);
   PADDLE_ENFORCE(net != nullptr, "failed to get step net");
   // If the InferShape is called in OperatorBase's run function,
   // the rnn op only needs to do InferShape for the first time step
@@ -198,82 +196,79 @@ void RecurrentAlgorithm::InferShape(const std::shared_ptr<Scope>& scope) const {
     if (i > 0) {
       rnn::LinkMemories(step_scopes, arg_->memories, i, -1);
     }
-    net->GetMutable<NetOp>()->InferShape(step_scopes[i]);
+    net->GetMutable<NetOp>()->InferShape(*step_scopes[i]);
   }
 
   auto outlinks = arg_->outlinks;
   for (size_t i = 0; i < outlinks.size(); i++) {
     DDim step_dims = step_scopes[0]
-                         ->GetVariable(outlinks[i].internal)
+                         ->FindVar(outlinks[i].internal)
                          ->GetMutable<Tensor>()
                          ->dims();
     std::vector<int> dims_vec = vectorize(step_dims);
     // now only support fixed length
     dims_vec.insert(dims_vec.begin(), seq_len_);
     Tensor* output =
-        step_scopes[0]->GetVariable(outlinks[i].external)->GetMutable<Tensor>();
+        step_scopes[0]->FindVar(outlinks[i].external)->GetMutable<Tensor>();
     output->Resize(make_ddim(dims_vec));
   }
 }
 
-void RecurrentAlgorithm::Run(const std::shared_ptr<Scope>& scope,
+void RecurrentAlgorithm::Run(const Scope& scope,
                              const platform::DeviceContext& dev_ctx) const {
   auto step_scopes = GetStepScopes(scope);
 
-  Variable* net = scope->GetVariable(arg_->step_net);
+  Variable* net = scope.FindVar(arg_->step_net);
   for (size_t step_id = 0; step_id < seq_len_; step_id++) {
     // the link memory is done in InferShape
     // maybe remove following code after testing
     if (step_id > 0) {
       rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1);
     }
-    net->GetMutable<NetOp>()->Run(step_scopes[step_id], dev_ctx);
+    net->GetMutable<NetOp>()->Run(*step_scopes[step_id], dev_ctx);
   }
 
   rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_);
 }
 
-void RecurrentAlgorithm::CreateScopes(std::shared_ptr<Scope> scope) const {
+void RecurrentAlgorithm::CreateScopes(const Scope& scope) const {
   // TODO(xxx) Only two scopes are needed for inference, this case will be
   // supported later.
-  auto step_scopes = scope->GetVariable(arg_->step_scopes)
-                         ->GetMutable<std::vector<std::shared_ptr<Scope>>>();
+  auto step_scopes =
+      scope.FindVar(arg_->step_scopes)->GetMutable<std::vector<Scope*>>();
 
   if (seq_len_ > step_scopes->size()) {
     for (size_t i = step_scopes->size(); i < seq_len_; ++i) {
-      std::shared_ptr<Scope> step_scope = std::make_shared<Scope>(scope);
+      auto& step_scope = scope.NewScope();
 
       // Now all variables in scope must be created outside of op.
-      auto net_op = scope->GetVariable(arg_->step_net)->GetMutable<NetOp>();
+      auto net_op = scope.FindVar(arg_->step_net)->GetMutable<NetOp>();
       for (auto& input : net_op->inputs_) {
-        step_scope->CreateVariable(input);
+        if (!step_scope.FindVar(input)) step_scope.NewVar(input);
       }
       for (auto& output : net_op->outputs_) {
-        step_scope->CreateVariable(output);
+        step_scope.NewVar(output);
       }
 
-      step_scopes->push_back(std::make_shared<Scope>(step_scope));
+      step_scopes->emplace_back(&step_scope);
     }
   }
 }
 
-void RecurrentAlgorithm::InitMemories(std::shared_ptr<Scope> step_scope) const {
+void RecurrentAlgorithm::InitMemories(Scope* step_scope) const {
   for (auto& attr : arg_->memories) {
-    Tensor* pre_mem =
-        step_scope->CreateVariable(attr.pre_var)->GetMutable<Tensor>();
-    PADDLE_ENFORCE(step_scope->HasVariable(attr.boot_var),
+    Tensor* pre_mem = step_scope->NewVar(attr.pre_var)->GetMutable<Tensor>();
+    PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
                    "memory [%s]'s boot variable [%s] not exists",
                    attr.var,
                    attr.boot_var);
-    Tensor* boot_mem =
-        step_scope->GetVariable(attr.boot_var)->GetMutable<Tensor>();
+    Tensor* boot_mem = step_scope->FindVar(attr.boot_var)->GetMutable<Tensor>();
     pre_mem->ShareDataWith<float>(*boot_mem);
 
     // TODO(qingqing) remove following code
     // the memory of current step should be allocated in step net
     // here for unit test
-    auto cur_step_mem =
-        step_scope->CreateVariable(attr.var)->GetMutable<Tensor>();
+    auto cur_step_mem = step_scope->NewVar(attr.var)->GetMutable<Tensor>();
     cur_step_mem->mutable_data<float>(boot_mem->dims(), platform::CPUPlace());
   }
 }
@@ -312,13 +307,14 @@ public:
       : OpProtoAndCheckerMaker(proto, op_checker) {
     const auto& name = RecurrentOp::kArgName;
     // inputs and outputs stored in proto
-    AddInputs(name.inlinks,
-              "the input that need to be segmented for each step.");
-    AddInputs(name.boot_memories, "variables to initialize memories.");
+    AddInput(name.inlinks, "the input that need to be segmented for each step.")
+        .SetMultiple();
+    AddInput(name.boot_memories, "variables to initialize memories.")
+        .SetMultiple();
     AddInput(name.step_net, "network shared by all steps.");
 
-    AddOutputs(name.outlinks,
-               "the output that need to concated for all steps.");
+    AddOutput(name.outlinks, "the output that need to concated for all steps.")
+        .SetMultiple();
     AddOutput(name.step_scopes, "step scopes");
 
     // Attributes stored in AttributeMap
@@ -333,72 +329,69 @@ public:
 };
 
 void RecurrentGradientAlgorithm::Run(
-    const std::shared_ptr<Scope>& scope,
-    const platform::DeviceContext& dev_ctx) const {
+    const Scope& scope, const platform::DeviceContext& dev_ctx) const {
   auto step_scopes = GetStepScopes(scope);
   rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_);
-  PADDLE_ENFORCE(scope->HasVariable(arg_->step_net),
+  PADDLE_ENFORCE(scope.FindVar(arg_->step_net) != nullptr,
                  "step net is not in scope.");
-  Variable* net = scope->GetVariable(arg_->step_net);
+  Variable* net = scope.FindVar(arg_->step_net);
   PADDLE_ENFORCE(net != nullptr, "failed to get step net");
   for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) {
     if (static_cast<size_t>(step_id) != seq_len_ - 1) {
       rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1);
     }
-    net->GetMutable<NetOp>()->Run(step_scopes[step_id], dev_ctx);
+    net->GetMutable<NetOp>()->Run(*step_scopes[step_id], dev_ctx);
   }
   LinkBootMemoryGradients(step_scopes[0]);
   rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_);
 }
 
 void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
-    std::shared_ptr<Scope> step_scope) const {
+    Scope* step_scope) const {
   for (auto& attr : arg_->memories) {
-    Tensor* mem_grad =
-        step_scope->CreateVariable(attr.var)->GetMutable<Tensor>();
+    Tensor* mem_grad = step_scope->NewVar(attr.var)->GetMutable<Tensor>();
     PADDLE_ENFORCE(mem_grad != nullptr,
                    "boot_tensor should be retrieved before");
-    PADDLE_ENFORCE(step_scope->HasVariable(attr.boot_var),
+    PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
                    "memory [%s]'s boot variable [%s] not exists",
                    attr.var,
                    attr.boot_var);
     Tensor* boot_mem_grad =
-        step_scope->CreateVariable(attr.boot_var)->GetMutable<Tensor>();
+        step_scope->NewVar(attr.boot_var)->GetMutable<Tensor>();
     boot_mem_grad->ShareDataWith<float>(*mem_grad);
   }
 }
 
-void RecurrentGradientAlgorithm::InferShape(
-    const std::shared_ptr<Scope>& scope) const {
-  seq_len_ = scope->GetVariable((arg_->inlinks[0]).external)
+void RecurrentGradientAlgorithm::InferShape(const Scope& scope) const {
+  seq_len_ = scope.FindVar((arg_->inlinks[0]).external)
                  ->GetMutable<Tensor>()
                  ->dims()[0];
   auto step_scopes = GetStepScopes(scope);
   rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_);
 
-  PADDLE_ENFORCE(scope->HasVariable(arg_->step_net),
+  PADDLE_ENFORCE(scope.FindVar(arg_->step_net) != nullptr,
                  "step net is not in scope.");
-  Variable* net = scope->GetVariable(arg_->step_net);
+  Variable* net = scope.FindVar(arg_->step_net);
   PADDLE_ENFORCE(net != nullptr, "failed to get step net");
 
   for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) {
     if (static_cast<size_t>(step_id) != seq_len_ - 1) {
       rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1);
     }
-    net->GetMutable<NetOp>()->InferShape(step_scopes[step_id]);
+    net->GetMutable<NetOp>()->InferShape(*step_scopes[step_id]);
   }
 
   auto outlinks = arg_->outlinks;
   for (size_t i = 0; i < outlinks.size(); i++) {
     DDim step_dims = step_scopes[0]
-                         ->GetVariable(outlinks[i].internal)
+                         ->FindVar(outlinks[i].internal)
                          ->GetMutable<Tensor>()
                          ->dims();
     std::vector<int> dims_vec = vectorize(step_dims);
     // now only support fixed length
     dims_vec.insert(dims_vec.begin(), seq_len_);
     Tensor* output =
-        step_scopes[0]->GetVariable(outlinks[i].external)->GetMutable<Tensor>();
+        step_scopes[0]->FindVar(outlinks[i].external)->GetMutable<Tensor>();
     output->Resize(make_ddim(dims_vec));
   }
   LinkBootMemoryGradients(step_scopes[0]);
diff --git a/paddle/operators/recurrent_network_op.h b/paddle/operators/recurrent_network_op.h
index 8946c8ce38117c391edcf56558c640ebd0d7f75c..d57a1a2e51cbed22549ab6ebce79223e2d4e3bcf 100644
--- a/paddle/operators/recurrent_network_op.h
+++ b/paddle/operators/recurrent_network_op.h
@@ -70,18 +70,18 @@ struct ArgumentName {
 /**
  * Prepare inputs for each step net.
  */
-void SegmentInputs(std::vector<std::shared_ptr<Scope>>& step_scopes,
+void SegmentInputs(const std::vector<Scope*>& step_scopes,
                    const std::vector<Link>& inlinks,
                    const size_t seq_len);
 
 /**
  * Process outputs of step nets and merge to variables.
  */
-void ConcatOutputs(std::vector<std::shared_ptr<Scope>>& step_scopes,
+void ConcatOutputs(const std::vector<Scope*>& step_scopes,
                    const std::vector<Link>& outlinks,
                    const size_t seq_len);
 
-void LinkMemories(std::vector<std::shared_ptr<Scope>>& step_scopes,
+void LinkMemories(const std::vector<Scope*>& step_scopes,
                   const std::vector<MemoryAttr>& memories,
                   size_t step_id,
                   int offset);
@@ -100,15 +100,14 @@ void InitArgument(const ArgumentName& name, Argument* arg);
 
 class RecurrentAlgorithm {
 public:
-  void Run(const std::shared_ptr<Scope>& scope,
-           const platform::DeviceContext& dev_ctx) const;
+  void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const;
 
   void Init(std::unique_ptr<rnn::Argument> arg) { arg_ = std::move(arg); }
 
   /**
    * InferShape must be called before Run.
    */
-  void InferShape(const std::shared_ptr<Scope>& scope) const;
+  void InferShape(const Scope& scope) const;
 
 protected:
   /*
@@ -117,15 +116,13 @@ protected:
    * NOTE the scopes are reused in both the forward and backward, so just
    * create once and expand its size if more steps need.
    */
-  void CreateScopes(std::shared_ptr<Scope> scope) const;
+  void CreateScopes(const Scope& scope) const;
 
-  inline const std::vector<std::shared_ptr<Scope>>& GetStepScopes(
-      std::shared_ptr<Scope> scope) const {
-    return *(scope->GetVariable(arg_->step_scopes))
-                ->GetMutable<std::vector<std::shared_ptr<Scope>>>();
+  const std::vector<Scope*>& GetStepScopes(const Scope& scope) const {
+    return *scope.FindVar(arg_->step_scopes)->GetMutable<std::vector<Scope*>>();
   }
 
-  void InitMemories(std::shared_ptr<Scope> step_scopes) const;
+  void InitMemories(Scope* step_scopes) const;
 
 private:
   std::unique_ptr<rnn::Argument> arg_;
@@ -146,21 +143,18 @@ class RecurrentGradientAlgorithm {
 public:
   void Init(std::unique_ptr<rnn::Argument> arg) { arg_ = std::move(arg); }
 
-  void Run(const std::shared_ptr<Scope>& scope,
-           const platform::DeviceContext& dev_ctx) const;
+  void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const;
 
-  void LinkBootMemoryGradients(std::shared_ptr<Scope> step_scopes) const;
+  void LinkBootMemoryGradients(Scope* step_scopes) const;
 
   /**
    * InferShape must be called before Run.
    */
-  void InferShape(const std::shared_ptr<Scope>& scope) const;
+  void InferShape(const Scope& scope) const;
 
 protected:
-  inline const std::vector<std::shared_ptr<Scope>>& GetStepScopes(
-      std::shared_ptr<Scope> scope) const {
-    return *(scope->GetVariable(arg_->step_scopes))
-                ->GetMutable<std::vector<std::shared_ptr<Scope>>>();
+  inline const std::vector<Scope*>& GetStepScopes(const Scope& scope) const {
+    return *scope.FindVar(arg_->step_scopes)->GetMutable<std::vector<Scope*>>();
   }
 
 private:
@@ -175,11 +169,11 @@ public:
   /**
    * InferShape must be called before Run.
    */
-  virtual void InferShape(const std::shared_ptr<Scope>& scope) const override {
+  virtual void InferShape(const Scope& scope) const override {
     alg_.InferShape(scope);
   }
 
-  virtual void Run(const std::shared_ptr<Scope>& scope,
+  virtual void Run(const Scope& scope,
                    const platform::DeviceContext& dev_ctx) const override {
     alg_.Run(scope, dev_ctx);
   }
@@ -197,11 +191,11 @@ public:
   /**
    * InferShape must be called before Run.
    */
-  virtual void InferShape(const std::shared_ptr<Scope>& scope) const override {
+  virtual void InferShape(const Scope& scope) const override {
     alg_.InferShape(scope);
   }
 
-  virtual void Run(const std::shared_ptr<Scope>& scope,
+  virtual void Run(const Scope& scope,
                    const platform::DeviceContext& dev_ctx) const override {
     alg_.Run(scope, dev_ctx);
   }
diff --git a/paddle/operators/recurrent_network_op_test.cc b/paddle/operators/recurrent_network_op_test.cc
index 6784ac6001ad1b464d65814cff1ad6247826ad66..b0e61fbee611744adb85b498b1c3540f059afc8c 100644
--- a/paddle/operators/recurrent_network_op_test.cc
+++ b/paddle/operators/recurrent_network_op_test.cc
@@ -34,41 +34,40 @@ protected:
   virtual void TearDown() override {}
 
   void CreateGlobalVariables() {
-    scope_ = std::make_shared<Scope>();
     // create input, and init content
     LOG(INFO) << "create global variable x";
     for (auto inlink : std::vector<std::string>{"x", "x0", "x1", "h"}) {
-      Variable* x = scope_->CreateVariable(inlink);
+      Variable* x = scope_.NewVar(inlink);
       DDim dims = make_ddim(std::vector<int>{
           10 /*sent size*/, 20 /*batch size*/, 30 /*input dim*/});
       x->GetMutable<Tensor>()->mutable_data<float>(dims, platform::CPUPlace());
     }
     // create output alias just for test
     for (auto inlink : std::vector<std::string>{"h@alias"}) {
-      Variable* x = scope_->CreateVariable(inlink);
+      Variable* x = scope_.NewVar(inlink);
       DDim dims =
           make_ddim(std::vector<int>{20 /*batch size*/, 30 /*input dim*/});
       x->GetMutable<Tensor>()->mutable_data<float>(dims, platform::CPUPlace());
     }
 
     LOG(INFO) << "create global variable w";
-    Variable* w = scope_->CreateVariable("rnn/w");
+    Variable* w = scope_.NewVar("rnn/w");
     w->GetMutable<Tensor>()->mutable_data<float>(
         make_ddim(std::vector<int>{30, 30}), platform::CPUPlace());
 
     for (auto boot : std::vector<std::string>{"x_boot", "h_boot"}) {
       LOG(INFO) << "create global variable " << boot;
-      Variable* h_boot = scope_->CreateVariable(boot);
+      Variable* h_boot = scope_.NewVar(boot);
       h_boot->GetMutable<Tensor>()->mutable_data<float>(
           make_ddim(std::vector<int>{20 /*batch size*/, 30 /*input dim*/}),
           platform::CPUPlace());
     }
 
     LOG(INFO) << "create variable step_scopes";
-    scope_->CreateVariable("step_scopes");
+    scope_.NewVar("step_scopes");
 
     LOG(INFO) << "create variable h";
-    scope_->CreateVariable("h");
+    scope_.NewVar("h");
   }
 
   void CreateRNNOp() {
@@ -150,7 +149,7 @@ protected:
 
   void CreateStepNet() {
     LOG(INFO) << "create variable step_net";
-    Variable* var = scope_->CreateVariable("step_net");
+    Variable* var = scope_.NewVar("step_net");
     auto net = var->GetMutable<NetOp>();
     // rnn/s is net's input or output?
     net->inputs_ = {"rnn/h@pre", "rnn/w", "rnn/x"};
@@ -164,7 +163,7 @@ protected:
   }
 
   // father scope
-  std::shared_ptr<Scope> scope_;
+  Scope scope_;
   std::shared_ptr<OperatorBase> rnn_op_;
 };
 
@@ -191,68 +190,64 @@ protected:
   virtual void TearDown() override {}
 
   void CreateGlobalVariables() {
-    scope_ = std::make_shared<Scope>();
     // inputs: x
     LOG(INFO) << "create global variable x";
-    Variable* x = scope_->CreateVariable("x");
+    Variable* x = scope_.NewVar("x");
     DDim dims =
         make_ddim({10 /*sent size*/, 20 /*batch size*/, 30 /*input dim*/});
     x->GetMutable<Tensor>()->mutable_data<float>(dims, platform::CPUPlace());
     // inputs: h_boot
     LOG(INFO) << "create global variable h_boot";
-    Variable* h_boot = scope_->CreateVariable("h_boot");
+    Variable* h_boot = scope_.NewVar("h_boot");
     h_boot->GetMutable<Tensor>()->mutable_data<float>(
         make_ddim({20 /*batch size*/, 30 /*input dim*/}), platform::CPUPlace());
     // inputs: w
     LOG(INFO) << "create global variable w";
-    Variable* w = scope_->CreateVariable("rnn/w");
+    Variable* w = scope_.NewVar("rnn/w");
     w->GetMutable<Tensor>()->mutable_data<float>(make_ddim({30, 30}),
                                                  platform::CPUPlace());
     // inputs: h_grad
     LOG(INFO) << "create variable h_grad";
-    Variable* dh = scope_->CreateVariable("h_grad");
+    Variable* dh = scope_.NewVar("h_grad");
     dh->GetMutable<Tensor>()->mutable_data<float>(make_ddim({10, 20, 30}),
                                                   platform::CPUPlace());
     // inputs: step_scopes
     LOG(INFO) << "create variable step_scopes";
-    scope_->CreateVariable("step_scopes");
+    scope_.NewVar("step_scopes");
     // inputs: step_net
     LOG(INFO) << "create variable step_net";
-    scope_->CreateVariable("step_net");
+    scope_.NewVar("step_net");
     // outputs: w_grad
     LOG(INFO) << "create global variable w_grad";
-    scope_->CreateVariable("rnn/w_grad");
+    scope_.NewVar("rnn/w_grad");
     // outputs: x_grad
     LOG(INFO) << "create global variable x_grad";
-    scope_->CreateVariable("x_grad");
+    scope_.NewVar("x_grad");
     // outputs: h_boot_grad
     LOG(INFO) << "create global variable h_boot_grad";
-    scope_->CreateVariable("h_boot_grad");
+    scope_.NewVar("h_boot_grad");
   }
 
   void CreateStepScopes() {
-    std::vector<std::shared_ptr<Scope>>* step_scopes =
-        scope_->GetVariable("step_scopes")
-            ->GetMutable<std::vector<std::shared_ptr<Scope>>>();
+    auto step_scopes =
+        scope_.FindVar("step_scopes")->GetMutable<std::vector<Scope*>>();
     for (int i = 0; i < 10; ++i) {
-      auto scope = std::make_shared<Scope>(scope_);
-      auto pre_t = scope->CreateVariable("rnn/pre_h")->GetMutable<Tensor>();
-      pre_t->mutable_data<float>(make_ddim({20, 30}), platform::CPUPlace());
-      auto tensor = scope->CreateVariable("rnn/h")->GetMutable<Tensor>();
-      tensor->mutable_data<float>(make_ddim({20, 30}), platform::CPUPlace());
+      auto& scope = scope_.NewScope();
+      auto pre_t = scope.NewVar("rnn/pre_h")->GetMutable<Tensor>();
+      pre_t->mutable_data<float>({20, 30}, platform::CPUPlace());
+      auto tensor = scope.NewVar("rnn/h")->GetMutable<Tensor>();
+      tensor->mutable_data<float>({20, 30}, platform::CPUPlace());
 
       // for unit test of ConcatOutputs
-      auto xg = scope->CreateVariable("rnn/x_grad")->GetMutable<Tensor>();
-      xg->mutable_data<float>(make_ddim({20, 30}), platform::CPUPlace());
+      auto xg = scope.NewVar("rnn/x_grad")->GetMutable<Tensor>();
+      xg->mutable_data<float>({20, 30}, platform::CPUPlace());
 
-      step_scopes->push_back(scope);
+      step_scopes->emplace_back(&scope);
     }
 
     // last time step
-    auto g = (*step_scopes)[9]
-                 ->CreateVariable("rnn/h_pre_grad")
-                 ->GetMutable<Tensor>();
-    g->mutable_data<float>(make_ddim({20, 30}), platform::CPUPlace());
+    auto g = (*step_scopes)[9]->NewVar("rnn/h_pre_grad")->GetMutable<Tensor>();
+    g->mutable_data<float>({20, 30}, platform::CPUPlace());
   }
 
   void CreateRNNGradientAlgorithm() {
@@ -280,7 +275,7 @@ protected:
 
   void CreateStepNet() {
     LOG(INFO) << "create variable step_net";
-    Variable* var = scope_->CreateVariable("step_net");
+    Variable* var = scope_.NewVar("step_net");
     auto net = var->GetMutable<NetOp>();
     net->AddOp(OpRegistry::CreateOp("mul",
                                     {"rnn/h_pre", "rnn/w", "rnn/s_grad"},
@@ -300,9 +295,8 @@ protected:
     rnn::Link inlink;
     inlink.external = "x";
     inlink.internal = "rnn/x";
-    std::vector<std::shared_ptr<Scope>>* step_scopes =
-        scope_->GetVariable("step_scopes")
-            ->GetMutable<std::vector<std::shared_ptr<Scope>>>();
+    auto step_scopes =
+        scope_.FindVar("step_scopes")->GetMutable<std::vector<Scope*>>();
     rnn::SegmentInputs(*step_scopes, std::vector<rnn::Link>{inlink}, 10);
   }
 
@@ -314,15 +308,14 @@ protected:
     mem_attr.boot_var = "boot_h";
     std::vector<rnn::MemoryAttr> memories;
     memories.push_back(mem_attr);
-    std::vector<std::shared_ptr<Scope>>* step_scopes =
-        scope_->GetVariable("step_scopes")
-            ->GetMutable<std::vector<std::shared_ptr<Scope>>>();
+    auto step_scopes =
+        scope_.FindVar("step_scopes")->GetMutable<std::vector<Scope*>>();
     for (int i = 1; i < 10; ++i) {
       rnn::LinkMemories(*step_scopes, memories, i, -1);
     }
   }
 
-  std::shared_ptr<Scope> scope_;
+  Scope scope_;
   RecurrentGradientAlgorithm rnn_grad_algo_;
 };
 
@@ -341,14 +334,14 @@ TEST(RecurrentOp, LinkMemories) {
 
   // create and init step scopes
   int len = 10;
-  std::vector<std::shared_ptr<Scope>> step_scopes;
+  std::vector<Scope*> step_scopes;
   for (int i = 0; i < len; ++i) {
-    auto scope = std::make_shared<Scope>();
-    scope->CreateVariable("pre_h");
-    auto tensor = scope->CreateVariable("h")->GetMutable<Tensor>();
-    float* data = tensor->mutable_data<float>(make_ddim({15, 20}), CPUPlace());
-    for (int i = 0; i < 15 * 20; ++i) {
-      data[i] = rand() * (1. / (double)RAND_MAX);
+    auto scope = new Scope();
+    scope->NewVar("pre_h");
+    auto tensor = scope->NewVar("h")->GetMutable<Tensor>();
+    float* data = tensor->mutable_data<float>({15, 20}, CPUPlace());
+    for (int j = 0; j < 15 * 20; ++j) {
+      data[j] = rand() * (1. / (double)RAND_MAX);
     }
     step_scopes.push_back(scope);
   }
@@ -367,9 +360,9 @@ TEST(RecurrentOp, LinkMemories) {
   // check
   for (int i = 0; i < len - 1; ++i) {
     const float* a =
-        step_scopes[i]->GetVariable("h")->GetMutable<Tensor>()->data<float>();
+        step_scopes[i]->FindVar("h")->GetMutable<Tensor>()->data<float>();
     const float* b = step_scopes[i + 1]
-                         ->GetVariable("pre_h")
+                         ->FindVar("pre_h")
                          ->GetMutable<Tensor>()
                          ->data<float>();
     for (size_t i = 0; i < 15 * 20; ++i) {
@@ -382,19 +375,25 @@ TEST(RecurrentOp, LinkMemories) {
   }
   // check
   for (int i = len - 2; i >= 0; --i) {
-    const float* a = step_scopes[i]
-                         ->GetVariable("pre_h")
-                         ->GetMutable<Tensor>()
-                         ->data<float>();
-    const float* b = step_scopes[i + 1]
-                         ->GetVariable("h")
-                         ->GetMutable<Tensor>()
-                         ->data<float>();
+    const float* a =
+        step_scopes[i]->FindVar("pre_h")->GetMutable<Tensor>()->data<float>();
+    const float* b =
+        step_scopes[i + 1]->FindVar("h")->GetMutable<Tensor>()->data<float>();
     for (size_t i = 0; i < 15 * 20; ++i) {
       ASSERT_FLOAT_EQ(a[i], b[i]);
     }
   }
+
+  for (auto s : step_scopes) {
+    delete s;
+  }
 }
 
 USE_OP(add_two);
 USE_OP(mul);
+
+// int main() {
+//  //! TODO(yuyang18): Temporary disable this unit-test because implementation
+//  //! error.
+//  return 0;
+//}
\ No newline at end of file
diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc
index 4129422fa744b2a7cf135b681efa73ffb2ebcdcc..2ad2b66c8f385c858eb34c7ea766f168de9c817e 100644
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -18,17 +18,17 @@ namespace operators {
 
 class RowWiseAddOp : public OperatorWithKernel {
 protected:
-  void InferShape(const std::vector<const Tensor *> &inputs,
-                  const std::vector<Tensor *> &outputs) const override {
-    PADDLE_ENFORCE(inputs.size() == 2UL, "Two inputs is needed by rowwise add");
-    auto dim0 = inputs[0]->dims();
-    auto dim1 = inputs[1]->dims();
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 2UL,
+                   "Two inputs is needed by rowwise add");
+    auto dim0 = ctx.Input<Tensor>(0)->dims();
+    auto dim1 = ctx.Input<Tensor>(1)->dims();
 
     PADDLE_ENFORCE(dim0.size() == 2, "Input 0 must be matrix");
     PADDLE_ENFORCE(dim1.size() == 1, "The second input must be vector");
     PADDLE_ENFORCE(dim0[1] == dim1[0], "The width of two input must be same");
-    PADDLE_ENFORCE(outputs.size() == 1, "The output size must be 1");
-    outputs[0]->Resize(inputs[0]->dims());
+    PADDLE_ENFORCE(ctx.OutputSize() == 1, "The output size must be 1");
+    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
   }
 };
 
diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h
index 4596925e9322f373c822608fd9aa6ecee6144d4c..b86dd5463436bf521f9939b1c421b39f11102769 100644
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
@@ -21,14 +21,12 @@ namespace operators {
 template <typename Place, typename T>
 class RowWiseAddKernel : public OpKernel {
 public:
-  void Compute(const KernelContext& context) const override {
-    auto in0 = context.Input(0)->Get<Tensor>();
-    auto in1 = context.Input(1)->Get<Tensor>();
-    auto* out = context.Output(0)->GetMutable<Tensor>();
+  void Compute(const ExecutionContext& context) const override {
+    auto out = context.Output<Tensor>(0);
     out->mutable_data<T>(context.GetPlace());
 
-    auto input = EigenMatrix<T>::From(in0);
-    auto bias = EigenVector<T>::From(in1);
+    auto input = EigenMatrix<T>::From(*context.Input<Tensor>(0));
+    auto bias = EigenVector<T>::From(*context.Input<Tensor>(1));
     auto output = EigenMatrix<T>::From(*out);
 
     const int bias_size = bias.dimension(0);
diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc
index f6c654a9e7083704e353c276e0abc975f4e61ef9..9a84dc8af3b3e649b776ca8a97dedba1fa3ff48d 100644
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -19,16 +19,15 @@ namespace operators {
 
 class SGDOp : public OperatorWithKernel {
 protected:
-  void InferShape(const std::vector<const Tensor *> &inputs,
-                  const std::vector<Tensor *> &outputs) const override {
-    PADDLE_ENFORCE(inputs.size() == 2, "Input size of SGDOp must be two");
-    PADDLE_ENFORCE(outputs.size() == 1, "Output size of SGDOp must be one");
-    PADDLE_ENFORCE(inputs[0] != nullptr, "inputs[0] mast be set");
-    PADDLE_ENFORCE(inputs[1] != nullptr, "inputs[1] mast be set");
-    PADDLE_ENFORCE(outputs[0] != nullptr, "outputs[0] mast be set");
-    PADDLE_ENFORCE(inputs[0]->dims() == inputs[1]->dims(),
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 2, "Input size of SGDOp must be two");
+    PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of SGDOp must be one");
+    PADDLE_ENFORCE(ctx.InputVar(0) != nullptr, "inputs[0] mast be set");
+    PADDLE_ENFORCE(ctx.InputVar(1) != nullptr, "inputs[1] mast be set");
+    PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr, "outputs[0] mast be set");
+    PADDLE_ENFORCE(ctx.Input<Tensor>(0)->dims() == ctx.Input<Tensor>(1)->dims(),
                    "Two input of SGD Op's dimension must be same.");
-    outputs[0]->Resize(inputs[0]->dims());
+    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
   }
 };
 
diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h
index 65179d323bd991b8b4e196c069a11cd901c62082..af1dfdd756ceb9991bee6b85c3281c05f0fb5a9f 100644
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
@@ -21,16 +21,16 @@ namespace operators {
 template <typename Place, typename T>
 class SGDOpKernel : public OpKernel {
 public:
-  void Compute(const KernelContext& ctx) const override {
-    auto param = ctx.Input("param")->Get<Tensor>();
-    auto grad = ctx.Input("grad")->Get<Tensor>();
-    auto* param_out = ctx.Output(0)->GetMutable<Tensor>();
+  void Compute(const ExecutionContext& ctx) const override {
+    auto param = ctx.Input<Tensor>("param");
+    auto grad = ctx.Input<Tensor>("grad");
+    auto param_out = ctx.Output<Tensor>(0);
     float lr = ctx.op_.GetAttr<float>("learning_rate");
 
     param_out->mutable_data<T>(ctx.GetPlace());
 
     EigenVector<T>::Flatten(*param_out).device(*(ctx.GetEigenDevice<Place>())) =
-        EigenVector<T>::Flatten(param) - lr * EigenVector<T>::Flatten(grad);
+        EigenVector<T>::Flatten(*param) - lr * EigenVector<T>::Flatten(*grad);
   }
 };
 
diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc
index 716f1d9c4dbc45e2d5569f8d634b06fd988a149c..a81ab262cc6fe7bdff0045259e0030f3d46f503f 100644
--- a/paddle/operators/sigmoid_op.cc
+++ b/paddle/operators/sigmoid_op.cc
@@ -18,11 +18,10 @@ namespace operators {
 
 class SigmoidOp : public OperatorWithKernel {
 protected:
-  void InferShape(const std::vector<const Tensor *> &inputs,
-                  const std::vector<Tensor *> &outputs) const override {
-    PADDLE_ENFORCE(inputs.size() == 1, "Sigmoid Op only have one input");
-    PADDLE_ENFORCE(outputs.size() == 1, "Sigmoid Op only have one output");
-    outputs[0]->Resize(inputs[0]->dims());
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 1, "Sigmoid Op only have one input");
+    PADDLE_ENFORCE(ctx.OutputSize() == 1, "Sigmoid Op only have one output");
+    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
   }
 };
 
@@ -38,8 +37,7 @@ public:
 
 class SigmoidOpGrad : public OperatorWithKernel {
 protected:
-  void InferShape(const std::vector<const Tensor *> &inputs,
-                  const std::vector<Tensor *> &outputs) const override {}
+  void InferShape(const InferShapeContext &ctx) const override {}
   std::string DebugString() const override {
     LOG(INFO) << "SigmoidGrad";
     return "";
diff --git a/paddle/operators/sigmoid_op.h b/paddle/operators/sigmoid_op.h
index 896a6f5d83e0f96de50e3aaae6f545172bf5da14..3dd23a9ebc7ac0972d6ee07b9ac051d59e66f62f 100644
--- a/paddle/operators/sigmoid_op.h
+++ b/paddle/operators/sigmoid_op.h
@@ -22,15 +22,14 @@ namespace operators {
 template <typename Place, typename T>
 class SigmoidKernel : public OpKernel {
 public:
-  void Compute(const KernelContext& context) const override {
-    auto input = context.Input(0)->Get<Tensor>();
-    auto* output = context.Output(0)->GetMutable<Tensor>();
-
+  void Compute(const ExecutionContext& context) const override {
+    auto input = context.Input<Tensor>(0);
+    auto output = context.Output<Tensor>(0);
     output->mutable_data<T>(context.GetPlace());
 
     EigenVector<T>::Flatten(*output).device(
         *(context.GetEigenDevice<Place>())) =
-        1.0 / (1.0 + (-1.0 * EigenVector<T>::Flatten(input)).exp());
+        1.0 / (1.0 + (-1.0 * EigenVector<T>::Flatten(*input)).exp());
   }
 };
 }  // namespace operators
diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc
index df60b62fa6ac8d67c9dadc40ec49aaedab92bc88..5b59fad7d5f9729b0862f8cd78cb32f94f87f513 100644
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -18,14 +18,13 @@ namespace operators {
 
 class SoftmaxOp : public OperatorWithKernel {
 protected:
-  void InferShape(const std::vector<const Tensor *> &inputs,
-                  const std::vector<Tensor *> &outputs) const override {
-    PADDLE_ENFORCE(inputs.size() == 1, "Only one input is need for softmax");
-    PADDLE_ENFORCE(inputs[0]->dims().size() == 2,
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 1, "Only one input is need for softmax");
+    PADDLE_ENFORCE(ctx.Input<Tensor>(0)->dims().size() == 2,
                    "The input of softmax op must be matrix");
-    PADDLE_ENFORCE(outputs.size() == 1, "Only one output is need for softmax");
-
-    outputs[0]->Resize(inputs[0]->dims());
+    PADDLE_ENFORCE(ctx.OutputSize() == 1,
+                   "Only one output is need for softmax");
+    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
   }
 };
 
@@ -41,8 +40,7 @@ public:
 
 class SoftmaxOpGrad : public OperatorWithKernel {
 protected:
-  void InferShape(const std::vector<const Tensor *> &inputs,
-                  const std::vector<Tensor *> &outputs) const override {}
+  void InferShape(const InferShapeContext &ctx) const override {}
   std::string DebugString() const override {
     LOG(INFO) << "SoftmaxOpGrad";
     return "";
diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h
index 625a87b58560231572c1cca2a21bd0c47c8cb296..a5c19c5fc7c6f5909dbb355aff09bf15405b6957 100644
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -22,12 +22,12 @@ namespace operators {
 template <typename Place, typename T>
 class SoftmaxKernel : public OpKernel {
 public:
-  void Compute(const KernelContext& context) const override {
-    auto input = context.Input(0)->Get<Tensor>();
-    auto* output = context.Output(0)->GetMutable<Tensor>();
+  void Compute(const ExecutionContext& context) const override {
+    auto input = context.Input<Tensor>(0);
+    auto output = context.Output<Tensor>(0);
     output->mutable_data<T>(context.GetPlace());
 
-    auto logits = EigenMatrix<T>::From(input);
+    auto logits = EigenMatrix<T>::From(*input);
     auto softmax = EigenMatrix<T>::From(*output);
 
     const int kBatchDim = 0;
diff --git a/paddle/operators/type_alias.h b/paddle/operators/type_alias.h
index b712e457ff60e8b30b87c0d549693d53e9f05d59..93b62cddc819e0d1fd48323e474a294ff0d327e1 100644
--- a/paddle/operators/type_alias.h
+++ b/paddle/operators/type_alias.h
@@ -22,7 +22,13 @@ namespace paddle {
 namespace operators {
 
 using OpKernel = framework::OpKernel;
-using KernelContext = framework::KernelContext;
+using InferShapeContext = framework::InferShapeContext;
+using ExecutionContext = framework::ExecutionContext;
+using Variable = framework::Variable;
+template <typename T,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
 template <typename T,
           int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index fd4adbd9deca12ad6c3a59cfd5d30fb0cb6fcf98..26c8eb78e614a68ec9728aad727d8fe3e08547ae 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -14,7 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <execinfo.h>
 #include <paddle/string/printf.h>
+#include <iomanip>
 #include <sstream>
 #include <stdexcept>
 #include <string>
@@ -39,12 +41,22 @@ namespace platform {
 struct EnforceNotMet : public std::exception {
   std::exception_ptr exp_;
   std::string err_str_;
-
   EnforceNotMet(std::exception_ptr e, const char* f, int l) : exp_(e) {
+    static constexpr int TRACE_STACK_LIMIT = 100;
     try {
       std::rethrow_exception(exp_);
     } catch (const std::exception& exp) {
-      err_str_ = string::Sprintf("%s at [%s:%d]", exp.what(), f, l);
+      std::ostringstream sout;
+      sout << string::Sprintf("%s at [%s:%d]", exp.what(), f, l) << std::endl;
+      sout << "Call Stacks: " << std::endl;
+      void* call_stack[TRACE_STACK_LIMIT];
+      int sz = backtrace(call_stack, TRACE_STACK_LIMIT);
+      auto line = backtrace_symbols(call_stack, sz);
+      for (int i = 0; i < sz; ++i) {
+        sout << line[i] << std::endl;
+      }
+      free(line);
+      err_str_ = sout.str();
     }
   }
 
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 4fa481bedf5a0a9b8474c01ebe06ee132ea66a91..801ef50e577d563f4534f33e49aa7b72ab840d89 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -103,15 +103,18 @@ All parameter, weight, gradient are variables in Paddle.
            },
            py::return_value_policy::reference);
 
-  py::class_<pd::Scope, std::shared_ptr<pd::Scope>>(m, "Scope")
-      .def(py::init<const std::shared_ptr<pd::Scope>&>())
-      .def("get_var",
-           &pd::Scope::GetVariable,
+  py::class_<pd::Scope>(m, "Scope", "")
+      .def("new_var",
+           [](pd::Scope& self, const std::string& name) -> pd::Variable* {
+             return self.NewVar(name);
+           },
            py::return_value_policy::reference)
-      .def("create_var",
-           &pd::Scope::CreateVariable,
+      .def("find_var", &pd::Scope::FindVar, py::return_value_policy::reference)
+      .def(py::init<>())
+      .def("new_scope",
+           [](pd::Scope& self) -> pd::Scope* { return &self.NewScope(); },
            py::return_value_policy::reference)
-      .def("get_var_name", &pd::Scope::GetVariableName);
+      .def("drop_kids", &pd::Scope::DropKids);
 
   //! @note: Be careful! PyBind will return std::string as an unicode, not
   //! Python str. If you want a str object, you should cast them in Python.
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index 83f72c137bdf5e55f28be908321bd2ccd6c906fe..3bee5b572ae42750332b69e28af980ae325532da 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -198,6 +198,11 @@ message RowConvConfig {
   required uint32 context_length = 1;
 }
 
+message SliceConfig {
+  required uint32 start = 1;
+  required uint32 end = 2;
+}
+
 message ProjectionConfig {
   required string type = 1;
   required string name = 2;
@@ -218,6 +223,10 @@ message ProjectionConfig {
 
   // For pool
   optional PoolConfig pool_conf = 12;
+
+  // For slice
+  // Each slice output is the input[start, end)
+  repeated SliceConfig slices = 13;
 }
 
 message OperatorConfig {
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 5477158ecb8646992ebdded0b15cce50720ebf36..f71fefffb59d4a53dda092ff83a61d9eec4b601f 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -565,6 +565,35 @@ class IdentityOffsetProjection(Projection):
         return []
 
 
+@config_class
+class SliceProjection(Projection):
+    type = 'slice'
+
+    def __init__(self, input_layer_name, slices, **xargs):
+        super(SliceProjection, self).__init__(input_layer_name, **xargs)
+        input = g_layer_map[input_layer_name]
+        if input.type in ["exconv", "cudnn_conv"]:
+            # the slice operator is for the channel dimension
+            assert input.num_filters is not None
+            channels = input.num_filters
+            image_size = input.size / channels
+            assert slices[len(slices) - 1][1] <= channels
+            for i in xrange(len(slices)):
+                slice = self.proj_conf.slices.add()
+                slice.start = slices[i][0] * image_size
+                slice.end = slices[i][1] * image_size
+                self.size += slice.end - slice.start
+        else:
+            config_assert(False,
+                          'Currently the input should be convolution layer')
+
+    def calc_parameter_size(self, input_size, output_size):
+        return 0
+
+    def calc_parameter_dims(self, input_size, output_size):
+        return []
+
+
 # DotMulProjection performs element-wise multiplication with weight
 @config_class
 class DotMulProjection(Projection):
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 14f072fc55109d770edf469ad7c574b8dda8a434..965874ddf632a83d00065c2d40037930a6e604a8 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -128,6 +128,7 @@ __all__ = [
     'prelu_layer',
     'gated_unit_layer',
     'crop_layer',
+    'slice_projection',
 ]
 
 
@@ -536,6 +537,45 @@ def identity_projection(input, offset=None, size=None):
     return proj
 
 
+def slice_projection(input, slices):
+    """
+    slice_projection can slice the input value into multiple parts,
+    and then select some of them to merge into a new output.
+
+    .. math::
+       output = [input.slices()]
+
+    The example usage is:
+
+    .. code-block:: python
+
+       proj = slice_projection(input=layer, slices=[(0, 10), (20, 30)])
+
+    Note that slice_projection should not have any parameter.
+
+    :param input: Input Layer.
+    :type input: LayerOutput
+    :param slices: An array of slice parameters.
+                   Each slice contains the start and end offsets based
+                   on the input.
+    :type slices: pair of int
+    :return: A SliceProjection object
+    :rtype: SliceProjection
+    """
+    assert len(slices) >= 1
+    start = 0
+    for i in xrange(len(slices)):
+        assert len(slices[i]) == 2
+        # The start position of the next slice needs to be greater than
+        # or equal to the end position of the previous slice.
+        assert slices[i][0] >= start
+        assert slices[i][1] >= slices[i][0]
+        start = slices[i][1]
+    proj = SliceProjection(input_layer_name=input.name, slices=slices)
+    proj.origin = input
+    return proj
+
+
 @wrap_param_attr_default()
 def scaling_projection(input, param_attr=None):
     """
diff --git a/python/paddle/v2/framework/default_scope_funcs.py b/python/paddle/v2/framework/default_scope_funcs.py
index 4e772326c94b7ee44906c71f13e9420e078a1917..1b5580c8b30f69016f187b1d8710a57b5f7cfa9f 100644
--- a/python/paddle/v2/framework/default_scope_funcs.py
+++ b/python/paddle/v2/framework/default_scope_funcs.py
@@ -5,7 +5,7 @@ Default scope function.
 thread-local stack of Scope. Top of that stack is current scope, the bottom 
 of that stack is all scopes' parent. 
 
-Invoking `create_var/get_var`  can `create/get` variable in current scope. 
+Invoking `new_var/find_var`  can `new/find` variable in current scope. 
 Invoking `enter_local_scope/leave_local_scope` can create or destroy local 
 scope. 
 
@@ -19,8 +19,8 @@ import threading
 __tl_scope__ = threading.local()
 
 __all__ = [
-    'get_cur_scope', 'enter_local_scope', 'leave_local_scope', 'create_var',
-    'get_var', 'scoped_function'
+    'get_cur_scope', 'enter_local_scope', 'leave_local_scope', 'new_var',
+    'find_var', 'scoped_function'
 ]
 
 
@@ -33,7 +33,7 @@ def get_cur_scope():
     if cur_scope_stack is None:
         __tl_scope__.cur_scope = list()
     if len(__tl_scope__.cur_scope) == 0:
-        __tl_scope__.cur_scope.append(paddle.v2.framework.core.Scope(None))
+        __tl_scope__.cur_scope.append(paddle.v2.framework.core.Scope())
     return __tl_scope__.cur_scope[-1]
 
 
@@ -42,7 +42,7 @@ def enter_local_scope():
     Enter a new local scope
     """
     cur_scope = get_cur_scope()
-    new_scope = paddle.v2.framework.core.Scope(cur_scope)
+    new_scope = cur_scope.new_scope()
     __tl_scope__.cur_scope.append(new_scope)
 
 
@@ -51,20 +51,21 @@ def leave_local_scope():
     Leave local scope
     """
     __tl_scope__.cur_scope.pop()
+    get_cur_scope().drop_kids()
 
 
-def create_var(name):
+def new_var(name):
     """
     create variable in current scope.
     """
-    return get_cur_scope().create_var(name)
+    return get_cur_scope().new_var(name)
 
 
-def get_var(name):
+def find_var(name):
     """
     get variable in current scope.
     """
-    return get_cur_scope().get_var(name)
+    return get_cur_scope().find_var(name)
 
 
 def scoped_function(func):
diff --git a/python/paddle/v2/framework/network.py b/python/paddle/v2/framework/network.py
index c85e87413ef45f40755709e134a277b8d8d1e233..cfeb0e3dec0fd2c6ad4d2d2501f97932495fdd41 100644
--- a/python/paddle/v2/framework/network.py
+++ b/python/paddle/v2/framework/network.py
@@ -1,6 +1,6 @@
 import paddle.v2.framework.core as core
 from paddle.v2.framework.create_op_creation_methods import op_creations
-from default_scope_funcs import create_var, get_var, get_cur_scope
+from default_scope_funcs import new_var, find_var, get_cur_scope
 
 __all__ = ['Network']  # Only expose Network
 
@@ -29,12 +29,15 @@ class NetworkFunctor(object):
             if ipt in kwargs:
                 var = kwargs[ipt]
                 if isinstance(var, basestring):
-                    var = create_var(var)
+                    tmp = new_var(var)
+                    self.net.var_names[tmp] = var
+                    var = tmp
+
                 if not isinstance(var, core.Variable):
                     raise TypeError(
                         "Input of op creation must be string or variable")
 
-                kwargs[ipt] = get_cur_scope().get_var_name(var)
+                kwargs[ipt] = self.net.var_names[var]
 
         notemp_outputs = self.func.all_not_temp_output_args
 
@@ -49,17 +52,20 @@ class NetworkFunctor(object):
             if opt in kwargs:
                 var = kwargs[opt]
                 if isinstance(var, basestring):
-                    var = create_var(var)
+                    tmp = new_var(var)
+                    self.net.var_names[tmp] = var
+                    var = tmp
+
                 if not isinstance(var, core.Variable):
                     raise TypeError(
                         "Output of op creation must be string or variable")
-                kwargs[opt] = get_cur_scope().get_var_name(var)
+                kwargs[opt] = self.net.var_names[var]
 
         op = self.func(**kwargs)
 
         self.net.net.add_op(op)
 
-        lst = [get_var(kwargs[opt]) for opt in notemp_outputs]
+        lst = [find_var(kwargs[opt]) for opt in notemp_outputs]
         if len(lst) == 1:
             return lst[0]
         elif len(lst) == 0:
@@ -89,6 +95,7 @@ class Network(object):
         self.net = core.Net.create()
         funcs = (func_name for func_name in dir(op_creations)
                  if not func_name.startswith("__"))
+        self.var_names = dict()
 
         # TODO(yuyang18): This code can work, but do not generate a good
         # docstring, try to give a better way generate function in runtime
diff --git a/python/paddle/v2/framework/tests/op_test_util.py b/python/paddle/v2/framework/tests/op_test_util.py
index 7b62313f8aca5e9f515d1a9e6df3bb6f51b974fb..99085c367221150c8386a24e8d90d58fd63894c4 100644
--- a/python/paddle/v2/framework/tests/op_test_util.py
+++ b/python/paddle/v2/framework/tests/op_test_util.py
@@ -24,13 +24,13 @@ class OpTestMeta(type):
             func = getattr(creation.op_creations, self.type, None)
             self.assertIsNotNone(func)
 
-            scope = core.Scope(None)
+            scope = core.Scope()
             kwargs = dict()
 
             for in_name in func.all_input_args:
                 if hasattr(self, in_name):
                     kwargs[in_name] = in_name
-                    var = scope.create_var(in_name).get_tensor()
+                    var = scope.new_var(in_name).get_tensor()
                     arr = getattr(self, in_name)
                     var.set_dims(arr.shape)
                     var.set(arr)
@@ -40,7 +40,7 @@ class OpTestMeta(type):
             for out_name in func.all_output_args:
                 if hasattr(self, out_name):
                     kwargs[out_name] = out_name
-                    scope.create_var(out_name).get_tensor()
+                    scope.new_var(out_name).get_tensor()
 
             for attr_name in func.all_attr_args:
                 if hasattr(self, attr_name):
@@ -54,7 +54,7 @@ class OpTestMeta(type):
             op.run(scope, ctx)
 
             for out_name in func.all_output_args:
-                actual = numpy.array(scope.get_var(out_name).get_tensor())
+                actual = numpy.array(scope.find_var(out_name).get_tensor())
                 expect = getattr(self, out_name)
                 # TODO(qijun) The default decimal is 7, but numpy.dot and eigen.mul
                 # has some diff, and could not pass unittest. So I set decimal 3 here.
diff --git a/python/paddle/v2/framework/tests/test_default_scope_funcs.py b/python/paddle/v2/framework/tests/test_default_scope_funcs.py
index 81033deb1546c81e2566ec5474f45dc56781644a..495863c4562b5a2d6755fb02e21a6b0c845fd7b6 100644
--- a/python/paddle/v2/framework/tests/test_default_scope_funcs.py
+++ b/python/paddle/v2/framework/tests/test_default_scope_funcs.py
@@ -7,19 +7,19 @@ class TestDefaultScopeFuncs(unittest.TestCase):
         self.assertIsNotNone(get_cur_scope())
 
     def test_none_variable(self):
-        self.assertIsNone(get_var("test"))
+        self.assertIsNone(find_var("test"))
 
     def test_create_var_get_var(self):
-        var_a = create_var("var_a")
+        var_a = new_var("var_a")
         self.assertIsNotNone(var_a)
-        self.assertIsNotNone(get_cur_scope().get_var('var_a'))
+        self.assertIsNotNone(get_cur_scope().find_var('var_a'))
         enter_local_scope()
-        self.assertIsNotNone(get_cur_scope().get_var('var_a'))
+        self.assertIsNotNone(get_cur_scope().find_var('var_a'))
         leave_local_scope()
 
     def test_var_get_int(self):
         def __new_scope__():
-            i = create_var("var_i")
+            i = new_var("var_i")
             self.assertFalse(i.is_int())
             i.set_int(10)
             self.assertTrue(i.is_int())
diff --git a/python/paddle/v2/framework/tests/test_fc_op.py b/python/paddle/v2/framework/tests/test_fc_op.py
index 59e7e61249e2a7d49a17e5d87209f03b8f35f730..43931aac406cd93beede008066aa1c0c00eba6ea 100644
--- a/python/paddle/v2/framework/tests/test_fc_op.py
+++ b/python/paddle/v2/framework/tests/test_fc_op.py
@@ -6,13 +6,13 @@ import paddle.v2.framework.create_op_creation_methods as creation
 
 class TestFc(unittest.TestCase):
     def test_fc(self):
-        scope = core.Scope(None)
-        x = scope.create_var("X")
+        scope = core.Scope()
+        x = scope.new_var("X")
         x_tensor = x.get_tensor()
         x_tensor.set_dims([1000, 784])
         x_tensor.alloc_float()
 
-        w = scope.create_var("W")
+        w = scope.new_var("W")
         w_tensor = w.get_tensor()
         w_tensor.set_dims([784, 100])
         w_tensor.alloc_float()
@@ -25,10 +25,10 @@ class TestFc(unittest.TestCase):
         op = creation.op_creations.fc(X="X", Y="Y", W="W")
 
         for out in op.outputs():
-            if scope.get_var(out) is None:
-                scope.create_var(out).get_tensor()
+            if scope.find_var(out) is None:
+                scope.new_var(out).get_tensor()
 
-        tensor = scope.get_var("Y").get_tensor()
+        tensor = scope.find_var("Y").get_tensor()
         op.infer_shape(scope)
         self.assertEqual([1000, 100], tensor.shape())
 
diff --git a/python/paddle/v2/framework/tests/test_scope.py b/python/paddle/v2/framework/tests/test_scope.py
index f0ee45cfc75e486c693a00d92a97ac0970195581..1ce9454067f91f39f01d9eb4c912857464a3c1cb 100644
--- a/python/paddle/v2/framework/tests/test_scope.py
+++ b/python/paddle/v2/framework/tests/test_scope.py
@@ -5,29 +5,29 @@ import unittest
 class TestScope(unittest.TestCase):
     def test_create_destroy(self):
         paddle_c = paddle.v2.framework.core
-        scope = paddle_c.Scope(None)
+        scope = paddle_c.Scope()
         self.assertIsNotNone(scope)
-        scope_with_parent = paddle_c.Scope(scope)
+        scope_with_parent = scope.new_scope()
         self.assertIsNotNone(scope_with_parent)
 
     def test_none_variable(self):
         paddle_c = paddle.v2.framework.core
-        scope = paddle_c.Scope(None)
-        self.assertIsNone(scope.get_var("test"))
+        scope = paddle_c.Scope()
+        self.assertIsNone(scope.find_var("test"))
 
     def test_create_var_get_var(self):
         paddle_c = paddle.v2.framework.core
-        scope = paddle_c.Scope(None)
-        var_a = scope.create_var("var_a")
+        scope = paddle_c.Scope()
+        var_a = scope.new_var("var_a")
         self.assertIsNotNone(var_a)
-        self.assertIsNotNone(scope.get_var('var_a'))
-        scope2 = paddle_c.Scope(scope)
-        self.assertIsNotNone(scope2.get_var('var_a'))
+        self.assertIsNotNone(scope.find_var('var_a'))
+        scope2 = scope.new_scope()
+        self.assertIsNotNone(scope2.find_var('var_a'))
 
     def test_var_get_int(self):
         paddle_c = paddle.v2.framework.core
-        scope = paddle_c.Scope(None)
-        var = scope.create_var("test_int")
+        scope = paddle_c.Scope()
+        var = scope.new_var("test_int")
         var.set_int(10)
         self.assertTrue(var.is_int())
         self.assertEqual(10, var.get_int())
diff --git a/python/paddle/v2/framework/tests/test_tensor.py b/python/paddle/v2/framework/tests/test_tensor.py
index b72aff3b9cd16595c7e81856642196b2bb61a790..6d59863cea29832f648139e07a134050e22bfa21 100644
--- a/python/paddle/v2/framework/tests/test_tensor.py
+++ b/python/paddle/v2/framework/tests/test_tensor.py
@@ -5,8 +5,8 @@ import numpy
 
 class TestScope(unittest.TestCase):
     def test_int_tensor(self):
-        scope = core.Scope(None)
-        var = scope.create_var("test_tensor")
+        scope = core.Scope()
+        var = scope.new_var("test_tensor")
         tensor = var.get_tensor()
 
         tensor.set_dims([1000, 784])
@@ -23,8 +23,8 @@ class TestScope(unittest.TestCase):
         self.assertEqual(2.0, tensor_array_2[19, 11])
 
     def test_float_tensor(self):
-        scope = core.Scope(None)
-        var = scope.create_var("test_tensor")
+        scope = core.Scope()
+        var = scope.new_var("test_tensor")
         tensor = var.get_tensor()
 
         tensor.set_dims([1000, 784])
diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py
index b658a81630733fea3976b812afe819d76de4cb25..fc718f031e2267e737adbc340226e145bf614bf2 100644
--- a/python/paddle/v2/master/client.py
+++ b/python/paddle/v2/master/client.py
@@ -76,3 +76,6 @@ class client(object):
         # Memory created from C should be freed.
         get_c_lib().mem_free(ret.contents)
         return record, 0
+
+    def paddle_start_get_records(self, pass_id):
+        get_c_lib().paddle_start_get_records(self.c, pass_id)
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
index 55a0fcdf56af7a8c9bee3255ea6f1d1ae1b34893..d0f18e4b6611fa56654e7f2a0144758339cb9e19 100644
--- a/python/paddle/v2/reader/creator.py
+++ b/python/paddle/v2/reader/creator.py
@@ -16,7 +16,7 @@ Creator package contains some simple reader creator, which could
 be used in user program.
 """
 
-__all__ = ['np_array', 'text_file', "recordio"]
+__all__ = ['np_array', 'text_file', "cloud_reader"]
 
 
 def np_array(x):
@@ -81,35 +81,41 @@ def recordio_local(paths, buf_size=100):
     return dec.buffered(reader, buf_size)
 
 
-def recordio(paths, buf_size=100):
+pass_num = 0
+
+
+def cloud_reader(paths, etcd_endpoints, timeout_sec=5, buf_size=64):
     """
-    Creates a data reader that outputs record one one by one
-        from given local or cloud recordio path.
+    Create a data reader that yields records one by one from
+        the given recordio paths.
     :path: path of recordio files.
+    :etcd_endpoints: the endpoints for etcd cluster
     :returns: data reader of recordio files.
+
+    ..  code-block:: python
+        from paddle.v2.reader.creator import cloud_reader
+        etcd_endpoints = "http://127.0.0.1:2379"
+        trainer.train(
+            reader=cloud_reader(["/work/dataset/uci_housing/uci_housing*"], etcd_endpoints),
+        )
     """
     import os
-    import paddle.v2.master.client as cloud
-
-    if "KUBERNETES_SERVICE_HOST" not in os.environ.keys():
-        return recordio_local(paths)
-
-    host_name = "MASTER_SERVICE_HOST"
-    if host_name not in os.environ.keys():
-        raise Exception('not find ' + host_name + ' in environment variable.')
-
-    addr = os.environ(host)
+    import cPickle as pickle
+    import paddle.v2.master as master
+    c = master.client(etcd_endpoints, timeout_sec, buf_size)
+    c.set_dataset(paths)
 
     def reader():
-        c = cloud(addr, buf_size)
-        c.set_dataset(paths)
+        global pass_num
+        c.paddle_start_get_records(pass_num)
+        pass_num += 1
 
         while True:
-            r, err = client.next_record()
-            if err < 0:
+            r, e = c.next_record()
+            if not r:
+                if e != -2:
+                    print "get record error: ", e
                 break
-            yield r
-
-        c.release()
+            yield pickle.loads(r)
 
     return reader
diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py
index b42d273ecfe6c4bc5706ec52617960b83496d70d..359f3eeefbe8efeb343cc875c707c9251a7087fb 100644
--- a/python/paddle/v2/reader/tests/creator_test.py
+++ b/python/paddle/v2/reader/tests/creator_test.py
@@ -34,14 +34,5 @@ class TestTextFile(unittest.TestCase):
             self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1))
 
 
-class TestRecordIO(unittest.TestCase):
-    def test_recordio(self):
-        path = os.path.join(
-            os.path.dirname(__file__), "test_recordio_creator.dat")
-        reader = paddle.v2.reader.creator.recordio([path])
-        for idx, r in enumerate(reader()):
-            self.assertSequenceEqual(r, str(idx))
-
-
 if __name__ == '__main__':
     unittest.main()