Merge branch 'develop' of https://github.com/PaddlePaddle/paddle into add-ClipLayer

82911cec · guosheng · d19355a5 · 7e60706b · 82911cec · 82911cec
80 changed file
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -198,6 +198,10 @@ identity_projection
 ..  autoclass:: paddle.v2.layer.identity_projection
    :noindex:

+slice_projection
+-------------------
+..  autoclass:: paddle.v2.layer.slice_projection
+    :noindex:

 table_projection
 ----------------

--- a/doc/design/scope.md
+++ b/doc/design/scope.md
@@ -37,8 +37,8 @@ Scope is an association of a name to variable. All variables belong to `Scope`.
 ```cpp
 class Scope {
 public:
-  Variable* CreateVariable(const std::string& name);
-  const Variable* GetVariable(const std::string& name) const;
+  Variable* NewVar(const std::string& name);
+  const Variable* FindVar(const std::string& name) const;

 private:
    std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
@@ -58,12 +58,12 @@ class Scope {
 public:
  Scope(const std::shared_ptr<Scope>& scope): parent_(scope) {}

-  Variable* GetVariable(const std::string& name) const {
+  Variable* FindVar(const std::string& name) const {
    auto it = vars_.find(name);
    if (it != vars_.end()) {
      return it->second.get();
    } else if (parent_ != nullptr) {
-      return parent_->GetVariable(name);
+      return parent_->FindVar(name);
    } else {
      return nullptr;
    }
@@ -95,10 +95,10 @@ class Scope {
  static std::shared_ptr<Scope> Create(const std::shared_ptr<Scope>& parent = nullptr);

  // return nullptr if not found.
-  Variable* GetVariable(const std::string& name) const;
+  Variable* FindVar(const std::string& name) const;

  // return if already contains same name variable.
-  Variable* CreateVariable(const std::string& name);
+  Variable* NewVar(const std::string& name);

 private:
  std::shared_ptr<Scope> parent_;
@@ -107,11 +107,11 @@ class Scope {
 ```
 ## Only scope can create a variable

-To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `CreateVariable` can construct `Variable`.
+To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `NewVar` can construct `Variable`.

 ## When scope destroyed, all variables inside this scope should be destroyed together

-The scope hold unique pointers for all variables. User can `GetVariable` from scope, but he should not hold this pointer as a member variable. Because when scope is destroyed, all variables inside this scope will be destroyed together.
+The scope hold unique pointers for all variables. User can `FindVar` from scope, but he should not hold this pointer as a member variable. Because when scope is destroyed, all variables inside this scope will be destroyed together.

 ## Sharing a parent scope

@@ -121,4 +121,4 @@ Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shar

 ## Orthogonal interface

-`GetVariable` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `CreateVariable` will return a `Error` when there is a name conflict locally. Combine `GetVariable` and `CreateVariable`, we can implement `CreateOrGetVariable` easily.
+`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `NewVar` will return a `Error` when there is a name conflict locally. Combine `FindVar` and `NewVar`, we can implement `NewVar` easily.
--- a/go/pserver/client/c/test/test_train.py
+++ b/go/pserver/client/c/test/test_train.py
@@ -3,24 +3,11 @@ import paddle.v2.dataset.uci_housing as uci_housing
 import paddle.v2.master as master
 import os
 import cPickle as pickle
+from paddle.v2.reader.creator import cloud_reader

 etcd_ip = os.getenv("MASTER_IP", "127.0.0.1")
-etcd_endpoint = "http://" + etcd_ip + ":2379"
-print "connecting to master, etcd endpoints: ", etcd_endpoint
-master_client = master.client(etcd_endpoint, 5, 64)
-
-
-def cloud_reader():
-    global master_client
-    master_client.set_dataset(
-        ["/pfs/dlnel/public/dataset/uci_housing/uci_housing-*"], passes=30)
-    while 1:
-        r, e = master_client.next_record()
-        if not r:
-            if e != -2:  # other errors
-                print "get record error:", e
-            break
-        yield pickle.loads(r)
+etcd_endpoints = "http://" + etcd_ip + ":2379"
+print "etcd endpoints: ", etcd_endpoints


 def main():
@@ -49,7 +36,7 @@ def main():
                                 parameters=parameters,
                                 update_equation=optimizer,
                                 is_local=False,
-                                 pserver_spec=etcd_endpoint,
+                                 pserver_spec=etcd_endpoints,
                                 use_etcd=True)

    # event_handler to print training and testing info
@@ -75,7 +62,11 @@ def main():
    trainer.train(
        reader=paddle.batch(
            paddle.reader.shuffle(
-                cloud_reader, buf_size=500), batch_size=2),
+                cloud_reader(
+                    ["/pfs/dlnel/public/dataset/uci_housing/uci_housing*"],
+                    etcd_endpoints),
+                buf_size=500),
+            batch_size=2),
        feeding={'x': 0,
                 'y': 1},
        event_handler=event_handler,

--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -269,8 +269,7 @@ void hl_sequence2batch_copy_padding(real* batch,
  int blockDimY = CUDA_BLOCK_SIZE / blockDimX;
  dim3 threads(blockDimX, blockDimY);

-  int gridDimX = (maxSequenceLength * blockDimX + CUDA_BLOCK_SIZE - 1) /
-      CUDA_BLOCK_SIZE;
+  int gridDimX = (maxSequenceLength + blockDimY - 1) / blockDimY;
  int gridDimY = numSequences;
  dim3 grid(gridDimX, gridDimY);


--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -8,7 +8,9 @@ cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)

 cc_test(variable_test SRCS variable_test.cc)
-cc_test(scope_test SRCS scope_test.cc)
+
+cc_library(scope SRCS scope.cc)
+cc_test(scope_test SRCS scope_test.cc DEPS scope)

 proto_library(attr_type SRCS attr_type.proto)
 proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
@@ -16,7 +18,7 @@ proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
 cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
 cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)

-cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor)
+cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor scope)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)

 cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS op_proto operator)
@@ -30,4 +32,7 @@ add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch
 add_dependencies(framework_py_proto framework_py_proto_init)

 cc_library(net SRCS net.cc DEPS op_registry)
-cc_test(net_op_test SRCS net_op_test.cc DEPS net add_op mul_op sigmoid_op softmax_op fc_op)
+cc_test(net_op_test SRCS net_op_test.cc DEPS net)
+
+cc_library(backward SRCS backward.cc DEPS net)
+cc_test(backward_test SRCS backward_test.cc DEPS backward)
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/backward.h"
+#include <list>
+#include "paddle/framework/net.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+
+static bool AllInSet(const std::vector<std::string>& names,
+                     const std::string& suffix,
+                     const std::unordered_set<std::string>& set) {
+  for (auto& name : names) {
+    if (set.find(name + suffix) == set.end()) {
+      return false;
+    }
+  }
+  return true;
+}
+
+static std::shared_ptr<OperatorBase> NOP() {
+  auto net_op = std::make_shared<NetOp>();
+  net_op->type_ = "@NOP@";
+  net_op->CompleteAddOp();
+  return net_op;
+}
+
+//  Get backward operator from a forward operator, recursively implementation.
+//
+//  no_grad_names the gradient variable names without gradient calculating.
+//
+//  uniq_id is a unique index used inside recursively calling BackwardRecursive.
+//  use `uid = uniq_id++;` to get the unique index, and pass `uniq_id` through
+//  recursive calling.
+//
+//  returns The backward operator. For simple situation, it is a simple
+//  operator. For complex situation, it is a NetOp.
+//
+//  See Backward.h for details
+static std::shared_ptr<OperatorBase> BackwardRecursive(
+    const OperatorBase& forwardOp,
+    std::unordered_set<std::string>& no_grad_names, size_t& uniq_id);
+std::shared_ptr<OperatorBase> BackwardRecursive(
+    const OperatorBase& forwardOp,
+    std::unordered_set<std::string>& no_grad_names, size_t& uniq_id) {
+  //  If all input gradients of forwarding operator do not need to calculate,
+  //  just return an NOP. Not return null ptr because NOP does not take
+  //  too much time for calculation, but it is useful for simplifying logic.
+  if (AllInSet(forwardOp.inputs_, OperatorBase::GRAD_VAR_SUFFIX(),
+               no_grad_names)) {
+    return NOP();
+  }
+
+  //  All output gradients of forwarding operator do not need to calculate. Then
+  //  all input gradients cannot be computed at all, and we put them into
+  //  `no_grad_names` set. Return an NOP.
+  if (AllInSet(forwardOp.outputs_, OperatorBase::GRAD_VAR_SUFFIX(),
+               no_grad_names)) {
+    for (auto& name : forwardOp.inputs_) {
+      // Mark all input is not need
+      no_grad_names.insert(name + OperatorBase::GRAD_VAR_SUFFIX());
+    }
+    return NOP();
+  }
+
+  // Returned gradient network
+  auto net = std::make_shared<NetOp>();
+
+  if (forwardOp.IsNetOp()) {
+    // Because forwardOp is a net op, it can static_cast.
+    auto& forwardNet = static_cast<const NetOp&>(forwardOp);
+
+    // Map from output gradient variable name to operator's indices in backward
+    // net. That operator generates that variable.
+    std::unordered_map<std::string, std::vector<size_t>> dup_output_ops;
+
+    size_t local_op_id = 0;
+    // reversely travel forwardNet
+    for (auto it = forwardNet.ops_.rbegin(); it != forwardNet.ops_.rend();
+         ++it, ++local_op_id) {
+      auto fwd = *it;
+      auto bwd = BackwardRecursive(*fwd, no_grad_names, uniq_id);
+      net->AddOp(bwd);
+      for (auto& out : bwd->outputs_) {
+        dup_output_ops[out].emplace_back(local_op_id);
+      }
+    }
+    // Get unique ID for this method.
+    auto uid = uniq_id++;
+    // TODO(dzh): more comment
+    using Pos = std::pair<size_t, std::shared_ptr<OperatorBase>>;
+    std::list<Pos> insert_position;
+    for (auto& dup_output_op : dup_output_ops) {
+      const std::string& name = dup_output_op.first;
+      auto& dup_op = dup_output_op.second;
+      if (dup_op.size() == 1) continue;
+      std::vector<std::string> dup_outputs;
+
+      for (size_t i = 0; i < dup_op.size(); ++i) {
+        auto op_offset = dup_op[i];
+        dup_outputs.push_back(name + "@RENAME@" + std::to_string(uid) + "@" +
+                              std::to_string(i));
+        net->ops_[op_offset]->Rename(name, dup_outputs.back());
+      }
+      insert_position.push_back(
+          {dup_op.back(),
+           OpRegistry::CreateOp(
+               "add", {dup_outputs}, {name},
+               {{"input_format",
+                 std::vector<int>{0, static_cast<int>(dup_outputs.size())}}})});
+    }
+
+    insert_position.sort(
+        [](const Pos& l, const Pos& r) { return l.first > r.first; });
+
+    for (auto& pos : insert_position) {
+      net->InsertOp(pos.first + 1, pos.second);
+    }
+
+  } else {
+    std::shared_ptr<OperatorBase> grad_op = OpRegistry::CreateGradOp(forwardOp);
+    for (std::string& grad_input : grad_op->inputs_) {
+      if (no_grad_names.count(grad_input)) {
+        std::string prefix = grad_input.substr(
+            0, grad_input.size() - OperatorBase::GRAD_VAR_SUFFIX().size());
+        grad_input = prefix + OperatorBase::ZERO_VAR_SUFFIX();
+
+        // If part of input gradient of that operator is not calculated, fill
+        // zero variables to that input gradient.
+        net->AddOp(OpRegistry::CreateOp("fill_zeros_like", {prefix},
+                                        {grad_input}, {}));
+      }
+    }
+
+    for (std::string& grad_output : grad_op->outputs_) {
+      if (no_grad_names.count(grad_output)) {
+        grad_output = OperatorBase::EMPTY_VAR_NAME();
+      }
+    }
+
+    if (net->ops_.empty()) {  // Current no aux op is added to network
+      return grad_op;
+    }
+    net->AddOp(grad_op);
+  }
+  net->type_ = "@GENERATED_BACKWARD@";
+  net->CompleteAddOp();
+  return net;
+}
+
+// See header for comments
+std::shared_ptr<OperatorBase> Backward(
+    const OperatorBase& forwardOp,
+    const std::unordered_set<std::string>& no_grad_vars) {
+  std::unordered_set<std::string> no_grad_names;
+  no_grad_names.reserve(no_grad_vars.size());
+
+  for (auto& name : no_grad_vars) {
+    no_grad_names.insert(name + OperatorBase::GRAD_VAR_SUFFIX());
+  }
+  size_t uid = 0;
+  return BackwardRecursive(forwardOp, no_grad_names, uid);
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/backward.h
+++ b/paddle/framework/backward.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <unordered_set>
+#include "operator.h"
+namespace paddle {
+namespace framework {
+
+// Create the backward operator from a forward operator.
+// TODO(yuyang18): Add more API reference comment.
+extern std::shared_ptr<OperatorBase> Backward(
+    const OperatorBase& forwardOp,
+    const std::unordered_set<std::string>& no_grad_vars);
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/backward.md
+++ b/paddle/framework/backward.md
+## Operator/expression 's Backward
+
+### Motivation
+
+In Neural Network, the backpropagation algorithm follows the chain rule, so we need to compound the fundmental gradient operators/expressions together with chain rule . Every forward network need a backward network to construct the full computation lineage, the operator/ expression's Backward feature will generate the backward pass respect to forward pass.
+
+### Implement : gradient operator registry
+
+|                        | forward operator | backward operator                |
+| ---------------------- | ---------------- | -------------------------------- |
+| **Operator::inputs_**  | Inputs           | Inputs, Outputs, OutputGradients |
+| **Operator::outputs_** | Outputs          | InputGradients                   |
+
+Inputs/Outputs means the input/output of the operator,  InputGradients/OutputGradients is the gradient respect to forward opeartor. Forward operator and Backward operator are isomorphic, save their corresponding needs into member attribute.
+
+We use a global hash map record the gradient operators available, follow the philosophy  of minimum core, make operator pluggable unit. Each gradient is an operator and it needs to regist itself. 
+
+grad_op_builder(fengjiayi)
+
+### Implement : Backward network
+
+given a forward network, it generates the backward network. We only care about the Gradients—`OutputGradients`,`InputGradients`.
+
+1. bla bla bla (yuyang)
+
+2. NetOp 
+
+   when the input forward network is a NetOp, it need to call the sub NetOp/Operators backward function recursively and ensure them done. During the process, we need to collect the `OutputGradients` name.
+
+   We share variable in the same scope, as a result, duplicate operator `OutputGradients` will overwirte then duplicate variable.  
+
+   ![./images/duplicate_op]()
+
+    Share variable between operators or same input variable used in multiple operators lead to a duplicate gradient variable. As demo show above, we need to rename gradient name recursively, and add a generic add operator instead. 
+
+![./images/duplicate_op2]()
+
+	Then collect the sub graph OutputGradients/InputGradients as the NetOp's and return it.
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/backward.h"
+
+#include <gtest/gtest.h>
+#include "paddle/framework/net.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+
+class EmptyOp : public OperatorBase {
+ public:
+  void InferShape(const Scope &scope) const override {}
+  void Run(const Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {}
+};
+
+class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input X of Add").IgnoreGradient();
+    AddInput("b", "Bias of Add").IgnoreGradient();
+    AddOutput("Out", "Out of Add").IgnoreGradient();
+    AddComment("Add Op");
+  }
+};
+
+class MulOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("A", "A");
+    AddInput("B", "B");
+    AddOutput("Out", "Out");
+    AddComment("Mul");
+  }
+};
+
+class SigmoidOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "X");
+    AddOutput("Y", "Y");
+    AddComment("Sigmoid");
+  }
+};
+
+class NoGradOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  NoGradOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "X input");
+    AddOutput("Y", "Y output");
+    AddComment("NoGradOp, same input output. no Grad");
+  }
+};
+
+class FcOp : public NetOp {
+ public:
+  void Init() override {
+    AddOp(OpRegistry::CreateOp("mul", {Input("X"), Input("W")},
+                               {Output("mul_result")}, {}));
+    auto b_name = Input("b");
+    std::string before_act = "mul_result";
+    if (b_name != EMPTY_VAR_NAME()) {
+      AddOp(OpRegistry::CreateOp("rowwise_add", {Output("mul_result"), b_name},
+                                 {Output("add_result")}, {}));
+      before_act = "add_result";
+    } else {
+      auto out_varname = Output("add_result");
+      if (out_varname != EMPTY_VAR_NAME()) {
+        this->Rename(out_varname, EMPTY_VAR_NAME());
+      }
+    }
+
+    AddOp(OpRegistry::CreateOp("sigmoid", {Output(before_act)}, {Output("Out")},
+                               {}));
+    CompleteAddOp(false);
+  }
+};
+
+class FcOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  FcOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "x");
+    AddInput("W", "w");
+    AddInput("b", "b");
+    AddOutput("mul_result", "").SetTemporary();
+    AddOutput("add_result", "").SetTemporary();
+    AddOutput("Out", "");
+    AddComment("");
+  }
+};
+
+class ManyOutputOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  ManyOutputOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("x", "x");
+    AddOutput("y", "y");
+    AddOutput("z", "z");
+    AddComment("");
+  }
+};
+
+class FillZeroOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("x", "x");
+    AddOutput("out", "out");
+    AddComment("");
+  }
+};
+
+class AddOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  AddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "x").SetMultiple();
+    AddOutput("Y", "y");
+    AddComment("");
+  }
+};
+}  // namespace framework
+}  // namespace paddle
+
+namespace f = paddle::framework;
+using EnforceNotMet = paddle::platform::EnforceNotMet;
+REGISTER_OP(rowwise_add, f::EmptyOp, f::RowWiseAddOpMaker);
+REGISTER_GRADIENT_OP(rowwise_add, rowwise_add_grad, f::EmptyOp);
+REGISTER_OP(mul, f::EmptyOp, f::MulOpMaker);
+REGISTER_GRADIENT_OP(mul, mul_grad, f::EmptyOp);
+REGISTER_OP(sigmoid, f::EmptyOp, f::SigmoidOpMaker);
+REGISTER_GRADIENT_OP(sigmoid, sigmoid_grad, f::EmptyOp);
+REGISTER_OP(nograd, f::EmptyOp, f::NoGradOpMaker);
+REGISTER_OP(fill_zeros_like, f::EmptyOp, f::FillZeroOpMaker);
+REGISTER_OP(add, f::EmptyOp, f::AddOpMaker);
+REGISTER_GRADIENT_OP(add, add_grad, f::EmptyOp);
+REGISTER_OP(fc, f::FcOp, f::FcOpMaker);
+REGISTER_OP(many_output_op, f::EmptyOp, f::ManyOutputOpMaker);
+REGISTER_GRADIENT_OP(many_output_op, many_output_op_grad, f::EmptyOp);
+
+TEST(Backward, simple_op_grad) {
+  auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {});
+  ASSERT_NE(fwd, nullptr);
+  auto gop = f::OpRegistry::CreateGradOp(*fwd);
+  ASSERT_EQ(1UL, gop->inputs_.size());
+  ASSERT_EQ("Out" + f::OperatorBase::GRAD_VAR_SUFFIX(), gop->inputs_[0]);
+  ASSERT_EQ("rowwise_add_grad", gop->type_);
+  ASSERT_EQ("X" + f::OperatorBase::GRAD_VAR_SUFFIX(), gop->outputs_[0]);
+  ASSERT_EQ("b" + f::OperatorBase::GRAD_VAR_SUFFIX(), gop->outputs_[1]);
+
+  ASSERT_EQ("X" + f::OperatorBase::GRAD_VAR_SUFFIX(),
+            gop->Output("X" + f::OperatorBase::GRAD_VAR_SUFFIX()));
+}
+
+TEST(Backward, simple_op_not_need_grad) {
+  auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {});
+  ASSERT_NE(fwd, nullptr);
+  auto gop = f::Backward(*fwd, {"X"});
+  ASSERT_EQ(std::find(gop->outputs_.begin(), gop->outputs_.end(),
+                      "X" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+            gop->outputs_.end());
+
+  auto no_input_gop = f::Backward(*fwd, {"X", "b"});
+  ASSERT_NE(no_input_gop, nullptr);
+  ASSERT_TRUE(no_input_gop->IsNetOp());
+  ASSERT_EQ(0UL, std::static_pointer_cast<f::NetOp>(no_input_gop)->ops_.size());
+}
+
+TEST(Backward, net_fc_backward_normal) {
+  std::shared_ptr<f::OperatorBase> fwd = f::OpRegistry::CreateOp(
+      "fc", {"X", "w", "b"}, {"mul_result", "add_result", "out"}, {});
+  ASSERT_NE(fwd, nullptr);
+  std::shared_ptr<f::OperatorBase> gop = f::Backward(*fwd, {});
+  ASSERT_TRUE(gop->IsNetOp());
+  auto net = static_cast<f::NetOp *>(gop.get());
+
+  ASSERT_NO_THROW(net->DebugString());
+
+  ASSERT_EQ(3UL, net->ops_.size());
+
+  f::OperatorBase &d_sigmoid = *net->ops_[0];
+  ASSERT_EQ("sigmoid_grad", d_sigmoid.type_);
+
+  f::OperatorBase &d_add = *net->ops_[1];
+  ASSERT_EQ("rowwise_add_grad", d_add.type_);
+
+  f::OperatorBase &d_mul = *net->ops_[2];
+  ASSERT_EQ("mul_grad", d_mul.type_);
+}
+
+TEST(Backward, net_fc_backward_not_have_b) {
+  std::shared_ptr<f::OperatorBase> fwd = f::OpRegistry::CreateOp(
+      "fc", {"X", "w", f::OperatorBase::EMPTY_VAR_NAME()},
+      {"mul_result", "add_result", "tmp"}, {});
+  ASSERT_NE(fwd, nullptr);
+  std::shared_ptr<f::OperatorBase> gop = f::Backward(*fwd, {});
+  ASSERT_TRUE(gop->IsNetOp());
+  auto net = static_cast<f::NetOp *>(gop.get());
+
+  ASSERT_NO_THROW(net->DebugString());
+
+  ASSERT_EQ(2UL, net->ops_.size());
+
+  f::OperatorBase &d_sigmoid = *net->ops_[0];
+  ASSERT_EQ("sigmoid_grad", d_sigmoid.type_);
+
+  f::OperatorBase &d_mul = *net->ops_[1];
+  ASSERT_EQ("mul_grad", d_mul.type_);
+}
+
+TEST(Backward, net_input_of_network_not_need_grad) {
+  f::NetOp net;
+  net.AddOp(f::OpRegistry::CreateOp("fc", {"X", "W1", "b1"},
+                                    {"mul_tmp_0", "add_tmp_0", "hidden0"}, {}));
+  net.AddOp(f::OpRegistry::CreateOp("fc", {"hidden0", "W2", "b2"},
+                                    {"mul_tmp_1", "add_tmp_1", "hidden1"}, {}));
+  net.CompleteAddOp();
+  auto bwd = Backward(net, {"X"});  // X@GRAD is not need.
+  ASSERT_TRUE(bwd->IsNetOp());
+  auto bwd_net = static_cast<f::NetOp *>(bwd.get());
+
+  std::unordered_set<std::string> all_output = std::unordered_set<std::string>(
+      bwd_net->outputs_.begin(), bwd_net->outputs_.end());
+  all_output.erase(f::OperatorBase::EMPTY_VAR_NAME());
+
+  for (auto &out : {"W1", "b1", "hidden0", "W2", "b2"}) {
+    ASSERT_NE(all_output.find(out + f::OperatorBase::GRAD_VAR_SUFFIX()),
+              all_output.end());
+  }
+
+  // Not Generated X
+  ASSERT_EQ(all_output.find("X" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+            all_output.end());
+
+  ASSERT_EQ(2UL, bwd_net->ops_.size());
+  ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp());
+  auto first_fc_grad = static_cast<f::NetOp *>(bwd_net->ops_[1].get());
+  ASSERT_EQ(3UL, first_fc_grad->ops_.size());
+  ASSERT_EQ(
+      f::OperatorBase::EMPTY_VAR_NAME(),
+      first_fc_grad->ops_[2]->Output("A" + f::OperatorBase::GRAD_VAR_SUFFIX()));
+}
+
+TEST(Backward, net_shared_weight) {
+  f::NetOp net;
+  net.AddOp(f::OpRegistry::CreateOp("mul", {"X", "W"}, {"Out"}, {}));
+  net.AddOp(f::OpRegistry::CreateOp("mul", {"Out", "W"}, {"FinalOut"}, {}));
+  net.CompleteAddOp();
+
+  auto bwd = f::Backward(net, {});
+  ASSERT_TRUE(bwd->IsNetOp());
+  auto bwd_net = static_cast<f::NetOp *>(bwd.get());
+  ASSERT_EQ(3UL, bwd_net->ops_.size());
+  ASSERT_EQ("add", bwd_net->ops_[2]->type_);
+}
+
+TEST(Backward, op_register_grad_not_for_network) {
+  auto fwd = f::OpRegistry::CreateOp(
+      "fc", {"X", "W", "b"}, {"mul_out", "add_out", "out1"},
+      {{"temporary_index", std::vector<int>{0, 1}}});
+
+  ASSERT_THROW(f::OpRegistry::CreateGradOp(*fwd), EnforceNotMet);
+}
+
+TEST(Backward, op_all_input_are_not_need) {
+  auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {});
+  auto backward = f::Backward(*fwd, {"X", "b"});
+  ASSERT_TRUE(backward->IsNetOp());
+  auto net = static_cast<f::NetOp *>(backward.get());
+  ASSERT_TRUE(net->ops_.empty());
+}
+
+TEST(Backward, op_all_output_are_not_need) {
+  auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {});
+  auto backward = f::Backward(*fwd, {"Out"});
+  ASSERT_TRUE(backward->IsNetOp());
+  auto net = static_cast<f::NetOp *>(backward.get());
+  ASSERT_TRUE(net->ops_.empty());
+}
+
+TEST(Backward, op_part_of_output_are_not_need) {
+  auto fwd = f::OpRegistry::CreateOp("many_output_op", {"X"}, {"Y", "Z"}, {});
+  auto backward = f::Backward(*fwd, {"Z"});
+  ASSERT_TRUE(backward->IsNetOp());
+  auto net = static_cast<f::NetOp *>(backward.get());
+  ASSERT_EQ(net->ops_.size(), 2UL);
+
+  auto &fill_zero = *net->ops_[0];
+  ASSERT_EQ("fill_zeros_like", fill_zero.type_);
+  ASSERT_EQ(1UL, fill_zero.inputs_.size());
+  ASSERT_EQ("Z", fill_zero.inputs_[0]);
+  ASSERT_EQ(1UL, fill_zero.outputs_.size());
+  ASSERT_EQ("Z" + f::OperatorBase::ZERO_VAR_SUFFIX(), fill_zero.outputs_[0]);
+
+  auto &d_many_out = *net->ops_[1];
+  ASSERT_EQ("many_output_op_grad", d_many_out.type_);
+  ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.inputs_.size());  // I/O/OG
+  ASSERT_EQ("Z" + f::OperatorBase::ZERO_VAR_SUFFIX(),
+            d_many_out.Input("z" + f::OperatorBase::GRAD_VAR_SUFFIX()));
+  ASSERT_EQ("Y" + f::OperatorBase::GRAD_VAR_SUFFIX(),
+            d_many_out.Input("y" + f::OperatorBase::GRAD_VAR_SUFFIX()));
+  ASSERT_EQ("X" + f::OperatorBase::GRAD_VAR_SUFFIX(),
+            d_many_out.Output("x" + f::OperatorBase::GRAD_VAR_SUFFIX()));
+}
+
+TEST(Backward, op_part_of_input_are_not_need) {
+  auto fwd = f::OpRegistry::CreateOp("mul", {"a", "b"}, {"out"}, {});
+  auto backward = f::Backward(*fwd, {"a"});
+  auto &grad_mul = *backward;
+  ASSERT_EQ(grad_mul.type_, "mul_grad");
+  ASSERT_EQ(grad_mul.inputs_.size(), 2UL + 1UL + 1UL);
+  ASSERT_EQ(grad_mul.outputs_.size(), 2UL);
+  ASSERT_EQ(grad_mul.Output("A" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+            f::OperatorBase::EMPTY_VAR_NAME());
+  ASSERT_EQ(grad_mul.Output("B" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+            "b" + f::OperatorBase::GRAD_VAR_SUFFIX());
+  ASSERT_EQ(grad_mul.Input("Out" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+            "out" + f::OperatorBase::GRAD_VAR_SUFFIX());
+  ASSERT_EQ(grad_mul.Input("A"), "a");
+  ASSERT_EQ(grad_mul.Input("B"), "b");
+  ASSERT_EQ(grad_mul.Input("Out"), "out");
+}
+
+TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
+  f::NetOp net;
+  net.AddOp(f::OpRegistry::CreateOp("fc", {"x1", "w1", "b1"},
+                                    {"mul_out1", "add_out1", "out1"}, {}));
+  net.AddOp(f::OpRegistry::CreateOp("fc", {"out1", "w2", "b2"},
+                                    {"mul_out2", "tmp_out2", "out2"}, {}));
+  net.AddOp(f::OpRegistry::CreateOp("fc", {"out2", "w3", "b3"},
+                                    {"mul_out3", "tmp_out3", "out3"}, {}));
+  net.CompleteAddOp();
+  auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"});
+  ASSERT_TRUE(backward->IsNetOp());
+  auto bwd_net = static_cast<f::NetOp *>(backward.get());
+  ASSERT_EQ(bwd_net->ops_.size(), 3UL);
+  auto &grad_fc = *bwd_net->ops_[0];
+  EXPECT_EQ(grad_fc.inputs_.size(),
+            3UL       /* external input number */
+                + 1UL /* external output number*/
+                + 1UL /* number of gradient of external output*/
+                - 1UL /*ignoreGradient varable number*/
+                + 2U /* internal variable number*/);
+  EXPECT_EQ(grad_fc.outputs_.size(), 2UL       /* input number of mul*/
+                                         + 2UL /* input number of rowwise_add */
+                                         + 1UL /* input number of sigmod */);
+  EXPECT_EQ(bwd_net->ops_[1]->inputs_.size(), 0UL);
+  EXPECT_EQ(bwd_net->ops_[1]->outputs_.size(), 0UL);
+  EXPECT_EQ(bwd_net->ops_[2]->inputs_.size(), 0UL);
+  EXPECT_EQ(bwd_net->ops_[2]->outputs_.size(), 0UL);
+
+  /*
+    EXPECT_EQ(grad_fc.Output("X" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+              f::OperatorBase::EMPTY_VAR_NAME());
+  EXPECT_EQ(grad_fc.Output("W" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+    "w3" + f::OperatorBase::GRAD_VAR_SUFFIX());
+  EXPECT_EQ(grad_fc.Output("b" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+    "b3" + f::OperatorBase::GRAD_VAR_SUFFIX());
+  EXPECT_EQ(grad_fc.Output("mul_result" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+  "mul_out3" + f::OperatorBase::GRAD_VAR_SUFFIX());
+
+  EXPECT_EQ(grad_fc.Input("Out" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+  "out3" + f::OperatorBase::GRAD_VAR_SUFFIX());
+  EXPECT_EQ(grad_fc.Input("X"), "out2");
+  EXPECT_EQ(grad_fc.Input("W"), "w3");
+  EXPECT_EQ(grad_fc.Input("mul_result"), "mul_out3");
+  EXPECT_EQ(grad_fc.Input("add_result"), "tmp_out3");
+  EXPECT_EQ(grad_fc.Input("Out"), "out3");
+  */
+}
--- a/paddle/framework/eigen.h
+++ b/paddle/framework/eigen.h
@@ -80,5 +80,21 @@ struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
  }
 };

+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenScalar {
+  // Scalar tensor (implemented as a rank-0 tensor) of scalar type T.
+  using Type = Eigen::TensorMap<
+      Eigen::TensorFixedSize<T, Eigen::Sizes<>, MajorType, IndexType>>;
+  using ConstType = Eigen::TensorMap<
+      Eigen::TensorFixedSize<const T, Eigen::Sizes<>, MajorType, IndexType>>;
+
+  static Type From(Tensor& tensor) { return Type(tensor.data<T>()); }
+
+  static ConstType From(const Tensor& tensor) {
+    return ConstType(tensor.data<T>());
+  }
+};
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/eigen_test.cc
+++ b/paddle/framework/eigen_test.cc
@@ -46,6 +46,17 @@ TEST(Eigen, Tensor) {
  }
 }

+TEST(Eigen, ScalarFrom) {
+  Tensor t;
+  int* p = t.mutable_data<int>(make_ddim({1}), platform::CPUPlace());
+  *p = static_cast<int>(100);
+
+  EigenScalar<int>::Type es = EigenScalar<int>::From(t);
+
+  ASSERT_EQ(0, es.dimension(0));
+  ASSERT_EQ(100, es(0));
+}
+
 TEST(Eigen, VectorFrom) {
  Tensor t;
  float* p = t.mutable_data<float>(make_ddim({6}), platform::CPUPlace());

--- a/paddle/framework/grad_op_builder.cc
+++ b/paddle/framework/grad_op_builder.cc
@@ -20,7 +20,7 @@ namespace framework {

 OperatorBase* GradOpBuilder::Build() {
  BuildOpInOutArgList();
-  std::string grad_op_type = OpRegistry::grad_ops().at(op_->type_);
+  std::string grad_op_type = OpRegistry::grad_ops().at(op_.type_);
  OperatorBase* grad_op = OpRegistry::op_creators().at(grad_op_type)();
  grad_op->type_ = grad_op_type;
  CompleteGradOp(grad_op);
@@ -39,15 +39,15 @@ OpInOutArg* GradOpBuilder::BuildArg(const VarProto& var,
 }

 void GradOpBuilder::BuildOpInOutArgList() {
-  const OpProto& op_proto = OpRegistry::protos().at(op_->type_);
-  const auto& var_map = *(OpRegistry::VarIndexMaps().at(op_->type_));
+  const OpProto& op_proto = OpRegistry::protos().at(op_.type_);
+  const auto& var_map = *(OpRegistry::VarIndexMaps().at(op_.type_));
  const std::vector<int>& in_format =
-      op_->attrs_.count("input_format")
-          ? op_->GetAttr<std::vector<int>>("input_format")
+      op_.attrs_.count("input_format")
+          ? op_.GetAttr<std::vector<int>>("input_format")
          : std::vector<int>();
  const std::vector<int>& out_format =
-      op_->attrs_.count("output_format")
-          ? op_->GetAttr<std::vector<int>>("output_format")
+      op_.attrs_.count("output_format")
+          ? op_.GetAttr<std::vector<int>>("output_format")
          : std::vector<int>();
  for (const auto& var : op_proto.inputs()) {
    arg_list_.emplace_back(
@@ -70,8 +70,7 @@ void GradOpBuilder::AddArgIntoGradOp(const OpInOutArg* arg,
  }
  (*varmap)[var_name] = idx++;
  size_t pre_sz = in_out.size();
-  auto base_it =
-      arg->type_ == IN ? op_->inputs_.begin() : op_->outputs_.begin();
+  auto base_it = arg->type_ == IN ? op_.inputs_.begin() : op_.outputs_.begin();
  std::copy(base_it + arg->begin_idx_, base_it + arg->end_idx_,
            std::back_inserter(in_out));
  if (is_grad) {
@@ -83,7 +82,7 @@ void GradOpBuilder::AddArgIntoGradOp(const OpInOutArg* arg,
 }

 void GradOpBuilder::CompleteGradOp(OperatorBase* grad_op) const {
-  grad_op->attrs_ = op_->attrs_;
+  grad_op->attrs_ = op_.attrs_;
  grad_op->attrs_.erase("input_format");
  grad_op->attrs_.erase("output_format");
  VarIndexMap* grad_varmap = new VarIndexMap();

--- a/paddle/framework/grad_op_builder.h
+++ b/paddle/framework/grad_op_builder.h
@@ -29,7 +29,7 @@ class GradOpBuilder {
  using VarIndexMap = std::unordered_map<std::string, int>;

 public:
-  GradOpBuilder(const OperatorBase* op) : op_(op) {}
+  GradOpBuilder(const OperatorBase& op) : op_(op) {}
  OperatorBase* Build();

 private:
@@ -40,7 +40,7 @@ class GradOpBuilder {
                        std::vector<int>& format, VarIndexMap* varmap, int& idx,
                        bool is_grad) const;
  void CompleteGradOp(OperatorBase* grad_op) const;
-  const OperatorBase* op_;
+  const OperatorBase& op_;
  std::vector<std::shared_ptr<OpInOutArg>> arg_list_;
 };


--- a/paddle/framework/grad_op_builder_test.cc
+++ b/paddle/framework/grad_op_builder_test.cc
@@ -11,7 +11,7 @@ namespace framework {
 TEST(GradOpBuilder, AddTwo) {
  std::shared_ptr<OperatorBase> add_op(
      OpRegistry::CreateOp("add_two", {"x", "y"}, {"out"}, {}));
-  std::shared_ptr<OperatorBase> grad_add_op = OpRegistry::CreateGradOp(add_op);
+  std::shared_ptr<OperatorBase> grad_add_op = OpRegistry::CreateGradOp(*add_op);
  EXPECT_EQ(static_cast<int>(grad_add_op->inputs_.size()), 4);
  EXPECT_EQ(static_cast<int>(grad_add_op->outputs_.size()), 2);
  EXPECT_EQ(grad_add_op->Input("X"), "x");

--- a/paddle/framework/images/duplicate_op.graffle
+++ b/paddle/framework/images/duplicate_op.graffle
--- a/paddle/framework/images/duplicate_op.png
+++ b/paddle/framework/images/duplicate_op.png
--- a/paddle/framework/images/duplicate_op2.graffle
+++ b/paddle/framework/images/duplicate_op2.graffle
--- a/paddle/framework/images/duplicate_op2.png
+++ b/paddle/framework/images/duplicate_op2.png
--- a/paddle/framework/net.h
+++ b/paddle/framework/net.h
@@ -43,7 +43,7 @@ class NetOp : public OperatorBase {
   * Infer all the operators' input and output variables' shapes, will be called
   * before every mini-batch
   */
-  void InferShape(const std::shared_ptr<Scope>& scope) const override {
+  void InferShape(const Scope& scope) const override {
    for (auto& op : ops_) {
      op->InferShape(scope);
    }
@@ -56,7 +56,7 @@ class NetOp : public OperatorBase {
   * scope will be used instead. If no OpContext is provicded, default context
   * will be used.
   */
-  void Run(const std::shared_ptr<Scope>& scope,
+  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {
    for (auto& op : ops_) {
      op->Run(scope, dev_ctx);
@@ -68,9 +68,18 @@ class NetOp : public OperatorBase {
   */
  void AddOp(const std::shared_ptr<OperatorBase>& op) {
    PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed");
+    PADDLE_ENFORCE(op != nullptr, "Cannot Insert Null op");
    ops_.push_back(op);
  }

+  void InsertOp(size_t pos, const std::shared_ptr<OperatorBase>& op) {
+    PADDLE_ENFORCE(!add_op_done_,
+                   "Cannot InsertOp when this network is sealed");
+    PADDLE_ENFORCE(op != nullptr, "Cannot Insert Null op");
+    PADDLE_ENFORCE(pos <= ops_.size(), "Out of range");
+    ops_.insert(ops_.begin() + pos, op);
+  }
+
  void CompleteAddOp(bool calculate = true);

  std::string DebugString() const override;

--- a/paddle/framework/net_op_test.cc
+++ b/paddle/framework/net_op_test.cc
@@ -3,11 +3,6 @@
 #include <paddle/framework/op_registry.h>
 #include <paddle/framework/operator.h>

-USE_OP(add_two);
-USE_OP(mul);
-USE_OP(sigmoid);
-USE_OP(softmax);
-
 namespace paddle {
 namespace framework {

@@ -16,16 +11,22 @@ static int run_cnt = 0;

 class TestOp : public OperatorBase {
 public:
-  void InferShape(
-      const std::shared_ptr<framework::Scope>& scope) const override {
+  void InferShape(const framework::Scope& scope) const override {
    ++infer_shape_cnt;
  }
-  void Run(const std::shared_ptr<framework::Scope>& scope,
+  void Run(const framework::Scope& scope,
           const paddle::platform::DeviceContext& dev_ctx) const override {
    ++run_cnt;
  }
 };

+class EmptyOp : public OperatorBase {
+ public:
+  void InferShape(const Scope& scope) const override {}
+  void Run(const Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override {}
+};
+
 template <typename T>
 void AssertSameVectorWithoutOrder(const std::vector<T>& expected,
                                  const std::vector<T>& actual) {
@@ -62,7 +63,7 @@ TEST(OpKernel, all) {
  ASSERT_EQ(1UL, tmp_idx.size());
  ASSERT_EQ("y", net->outputs_[tmp_idx[0]]);

-  auto scope = std::make_shared<Scope>();
+  Scope scope;
  platform::CPUDeviceContext dev_ctx;

  net->InferShape(scope);
@@ -72,20 +73,17 @@ TEST(OpKernel, all) {
  ASSERT_THROW(net->AddOp(op2), paddle::platform::EnforceNotMet);
 }

-//! TODO(yuyang18): Refine Backward Op.
-// TEST(AddBackwardOp, TestGradOp) {
-//  auto net = std::make_shared<NetOp>();
-//  ASSERT_NE(net, nullptr);
-//  net->AddOp(framework::OpRegistry::CreateOp("mul", {"X", "Y"}, {"Out"}, {}));
-//  net->AddOp(
-//      framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {"Out"}, {}));
-//  net->AddOp(framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {""},
-//  {}));
-//  auto grad_ops = AddBackwardOp(net);
-//  for (auto& op : grad_ops->ops_) {
-//    op->DebugString();
-//  }
-//}
+TEST(Net, insert_op) {
+  NetOp net;
+  auto op1 = std::make_shared<EmptyOp>();
+  op1->inputs_ = {"x", "w1", "b1"};
+  op1->outputs_ = {"y"};
+  net.AddOp(op1);
+  net.InsertOp(0, op1);
+  ASSERT_EQ(2UL, net.ops_.size());
+  net.InsertOp(2, op1);
+  ASSERT_EQ(3UL, net.ops_.size());
+}

 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -86,43 +86,46 @@ class OpProtoAndCheckerMaker {
  }

 protected:
-  void AddInput(const std::string& name, const std::string& comment,
-                bool multiple = false, bool ignore_gradient = false) {
+  struct VariableBuilder {
+    VarProto* var_;
+    std::function<void()> on_multiple_;
+    std::function<void()> on_temporary_;
+
+    VariableBuilder& SetMultiple() {
+      var_->set_multiple(true);
+      on_multiple_();
+      return *this;
+    }
+
+    VariableBuilder& SetTemporary() {
+      PADDLE_ENFORCE(bool(on_temporary_), "Cannot set temporary");
+      var_->set_temporary(true);
+      on_temporary_();
+      return *this;
+    }
+
+    VariableBuilder& IgnoreGradient() {
+      var_->set_ignore_gradient(true);
+      return *this;
+    }
+  };
+
+  VariableBuilder AddInput(const std::string& name,
+                           const std::string& comment) {
    auto input = proto_->mutable_inputs()->Add();
    *input->mutable_name() = name;
    *input->mutable_comment() = comment;
-    input->set_ignore_gradient(ignore_gradient);
-    input->set_multiple(multiple);
-    if (multiple) {
-      SetHasMultipleInput();
-    }
-  }
-
-  void AddInputs(const std::string& name, const std::string& comment,
-                 bool ignore_gradient = false) {
-    AddInput(name, comment, true, ignore_gradient);
+    return VariableBuilder{input, [=] { this->SetHasMultipleInput(); },
+                           nullptr};
  }

-  void AddOutput(const std::string& name, const std::string& comment,
-                 bool temporary = false, bool multiple = false,
-                 bool ignore_gradient = false) {
+  VariableBuilder AddOutput(const std::string& name,
+                            const std::string& comment) {
    auto output = proto_->mutable_outputs()->Add();
    *output->mutable_name() = name;
    *output->mutable_comment() = comment;
-    output->set_ignore_gradient(ignore_gradient);
-    output->set_multiple(multiple);
-    if (multiple) {
-      SetHasMultipleOutput();
-    }
-    output->set_temporary(temporary);
-    if (temporary) {
-      SetHasTemporaryOutput();
-    }
-  }
-
-  void AddOutputs(const std::string& name, const std::string& comment,
-                  bool temporary = false, bool ignore_gradient = false) {
-    AddOutput(name, comment, temporary, true, ignore_gradient);
+    return VariableBuilder{output, [=] { this->SetHasMultipleOutput(); },
+                           [=] { this->SetHasTemporaryOutput(); }};
  }

  template <typename T>
@@ -300,9 +303,10 @@ class OpRegistry {
    return CreateOp(op_desc.type(), inputs, outputs, attrs);
  }

-  static std::shared_ptr<OperatorBase> CreateGradOp(
-      std::shared_ptr<OperatorBase> op) {
-    GradOpBuilder builder(op.get());
+  static std::shared_ptr<OperatorBase> CreateGradOp(const OperatorBase& op) {
+    PADDLE_ENFORCE(!op.IsNetOp(),
+                   "Use framework::Backward to get backward ops");
+    GradOpBuilder builder(op);
    std::shared_ptr<OperatorBase> grad_op(builder.Build());
    grad_op->Init();
    return grad_op;

--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -7,9 +7,9 @@ namespace paddle {
 namespace framework {
 class CosineOp : public OperatorBase {
 public:
-  void Run(const std::shared_ptr<Scope>& scope,
+  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {}
-  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
+  void InferShape(const Scope& scope) const override {}
 };

 class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
@@ -27,8 +27,8 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {

 class MyTestOp : public OperatorBase {
 public:
-  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
-  void Run(const std::shared_ptr<Scope>& scope,
+  void InferShape(const Scope& scope) const override {}
+  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {}
 };

@@ -36,9 +36,8 @@ class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 public:
  MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInputs("input", "input of cosine op");
-    AddOutput("output", "output of cosine op",
-              /*temporary*/ true);
+    AddInput("input", "input of cosine op").SetMultiple();
+    AddOutput("output", "output of cosine op").SetTemporary();
    auto my_checker = [](int i) {
      PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!");
    };
@@ -69,7 +68,7 @@ TEST(OpRegistry, CreateOp) {

  std::shared_ptr<paddle::framework::OperatorBase> op =
      paddle::framework::OpRegistry::CreateOp(op_desc);
-  auto scope = std::make_shared<paddle::framework::Scope>();
+  paddle::framework::Scope scope;
  paddle::platform::CPUDeviceContext dev_ctx;
  op->Run(scope, dev_ctx);
  float scale_get = op->GetAttr<float>("scale");
@@ -111,7 +110,7 @@ TEST(OpRegistry, DefaultValue) {

  std::shared_ptr<paddle::framework::OperatorBase> op =
      paddle::framework::OpRegistry::CreateOp(op_desc);
-  auto scope = std::make_shared<paddle::framework::Scope>();
+  paddle::framework::Scope scope;
  paddle::platform::CPUDeviceContext dev_ctx;
  op->Run(scope, dev_ctx);
  ASSERT_EQ(op->GetAttr<float>("scale"), 1.0);
@@ -173,7 +172,7 @@ TEST(OpRegistry, CustomChecker) {
  SetInputFormat(&op_desc);
  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
  paddle::platform::CPUDeviceContext dev_ctx;
-  auto scope = std::make_shared<paddle::framework::Scope>();
+  paddle::framework::Scope scope;
  op->Run(scope, dev_ctx);
  int test_attr = op->GetAttr<int>("test_attr");
  ASSERT_EQ(test_attr, 4);

--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -20,7 +20,7 @@ namespace paddle {
 namespace framework {

 template <>
-Eigen::DefaultDevice* KernelContext::GetEigenDevice<
+Eigen::DefaultDevice* ExecutionContext::GetEigenDevice<
    platform::CPUPlace, Eigen::DefaultDevice>() const {
  return device_context_.get_eigen_device<Eigen::DefaultDevice>();
 }
@@ -28,7 +28,7 @@ Eigen::DefaultDevice* KernelContext::GetEigenDevice<
 #ifndef PADDLE_ONLY_CPU
 template <>
 Eigen::GpuDevice*
-KernelContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
+ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
  return device_context_.get_eigen_device<Eigen::GpuDevice>();
 }
 #endif
@@ -52,7 +52,8 @@ std::vector<std::string> OperatorBase::Inputs(const std::string& name) const {
  PADDLE_ENFORCE(in_out_idxs_ != nullptr, "IO Idx could not be nullptr");
  auto input_format = GetAttr<std::vector<int>>("input_format");
  auto offset = in_out_idxs_->at(name);
-  PADDLE_ENFORCE(input_format.at((size_t)offset + 1) <= inputs_.size(),
+  PADDLE_ENFORCE(input_format.at(static_cast<size_t>(offset) + 1) <=
+                     static_cast<int>(inputs_.size()),
                 "Input Out Of Range");

  return std::vector<std::string>{
@@ -78,7 +79,8 @@ std::vector<std::string> OperatorBase::Outputs(const std::string& name) const {
  PADDLE_ENFORCE(in_out_idxs_ != nullptr, "InOut Indice could not be nullptr");
  auto output_format = GetAttr<std::vector<int>>("output_format");
  auto offset = in_out_idxs_->at(name);
-  PADDLE_ENFORCE(output_format.at((size_t)offset + 1) <= outputs_.size(),
+  PADDLE_ENFORCE(output_format.at(static_cast<size_t>(offset) + 1) <=
+                     static_cast<int>(outputs_.size()),
                 "Output Out of Range");
  return std::vector<std::string>{
      outputs_.begin() + output_format.at(offset),
@@ -105,5 +107,11 @@ std::string OperatorBase::DebugString() const {
  return ss.str();
 }

+void OperatorBase::Rename(const std::string& old_name,
+                          const std::string& new_name) {
+  std::replace(inputs_.begin(), inputs_.end(), old_name, new_name);
+  std::replace(outputs_.begin(), outputs_.end(), old_name, new_name);
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include <algorithm>
 #include <boost/variant.hpp>
 #include <string>
 #include <unordered_map>
@@ -31,22 +32,9 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-template <typename T>
-struct EigenDeviceConverter;
-
-template <>
-struct EigenDeviceConverter<platform::CPUPlace> {
-  using EigenDeviceType = Eigen::DefaultDevice;
-};
-
-#ifndef PADDLE_ONLY_CPU
-template <>
-struct EigenDeviceConverter<platform::GPUPlace> {
-  using EigenDeviceType = Eigen::GpuDevice;
-};
-#endif
-
 class OperatorBase;
+class InferShapeContext;
+class ExecutionContext;
 /**
 * OperatorBase has the basic element that Net will call to do computation.
 * Only CreateOperator from OpRegistry will new Operator directly. User
@@ -67,6 +55,9 @@ class OperatorBase {
  /// e.g. Variable "x@GRAD" is the gradient of varibale "x".
  static std::string GRAD_VAR_SUFFIX() { return "@GRAD"; }

+  /// Variables with this suffix are supposed to be filled up with zeros.
+  static std::string ZERO_VAR_SUFFIX() { return "@ZERO"; }
+
  virtual ~OperatorBase() {}

  template <typename T>
@@ -84,16 +75,20 @@ class OperatorBase {

  /// InferShape infer the size of Variables used by this Operator with
  /// information inside scope
-  virtual void InferShape(const std::shared_ptr<Scope>& scope) const = 0;
+  virtual void InferShape(const Scope& scope) const = 0;

  /// Net will call this function to Run an op.
-  virtual void Run(const std::shared_ptr<Scope>& scope,
+  virtual void Run(const Scope& scope,
                   const platform::DeviceContext& dev_ctx) const = 0;

  virtual bool IsNetOp() const { return false; }

+  /// rename inputs outputs name
+  void Rename(const std::string& old_name, const std::string& new_name);
+
  //! Get a input with argument's name described in `op_proto`
  const std::string& Input(const std::string& name) const;
+
  //! Get a input which has multiple variables.
  //! TODO add a vector_view to prevent memory copy.
  std::vector<std::string> Inputs(const std::string& name) const;
@@ -105,53 +100,156 @@ class OperatorBase {

 public:
  std::string type_;
+  // NOTE: in case of OpGrad, inputs_ contains:
+  // I (Inputs)
+  // O (Outputs)
+  // OG (Output Gradients)
  std::vector<std::string> inputs_;
+  // NOTE: in case of OpGrad, outputs_ contains
+  // IG (Inputs Gradients)
  std::vector<std::string> outputs_;
  AttributeMap attrs_;
  // store the arguments' offset described in op_desc.
  std::shared_ptr<std::unordered_map<std::string, int>> in_out_idxs_;
 };

-class KernelContext {
+class OperatorContext {
 public:
-  KernelContext(const OperatorBase* op, const std::shared_ptr<Scope>& scope,
-                const platform::DeviceContext& device_context)
-      : op_(*op), scope_(scope), device_context_(device_context) {}
+  OperatorContext(const OperatorBase* op, const Scope& scope)
+      : op_(*op), scope_(scope) {}
+
+  size_t InputSize() const { return op_.inputs_.size(); }

-  const Variable* Input(int index) const {
-    return scope_->GetVariable(op_.inputs_[index]);
+  size_t OutputSize() const { return op_.outputs_.size(); }
+
+  const Variable* InputVar(const size_t index) const {
+    return scope_.FindVar(op_.inputs_.at(index));
  }

-  Variable* Output(int index) const {
-    return scope_->GetVariable(op_.outputs_[index]);
+  Variable* OutputVar(const size_t index) const {
+    return scope_.FindVar(op_.outputs_.at(index));
  }

-  const Variable* Input(const std::string& name) const {
-    return scope_->GetVariable(op_.Input(name));
+  const Variable* InputVar(const std::string& name) const {
+    return scope_.FindVar(op_.Input(name));
  }

-  const Variable* Output(const std::string& name) const {
-    return scope_->GetVariable(op_.Output(name));
+  Variable* OutputVar(const std::string& name) const {
+    return scope_.FindVar(op_.Output(name));
  }

-  const std::vector<const Variable*> Inputs(const std::string& name) const {
+  const std::vector<const Variable*> MultiInputVar(
+      const std::string& name) const {
    auto names = op_.Inputs(name);
    std::vector<const Variable*> res;
+    res.reserve(names.size());
    std::transform(
-        names.begin(), names.end(), res.begin(),
-        [this](const std::string& name) { return scope_->GetVariable(name); });
+        names.begin(), names.end(), std::back_inserter(res),
+        [this](const std::string& name) { return scope_.FindVar(name); });
    return res;
  }

-  const std::vector<const Variable*> Outputs(const std::string& name) const {
+  std::vector<const Variable*> MultiOutputVar(const std::string& name) const {
    auto names = op_.Outputs(name);
    std::vector<const Variable*> res;
+    res.reserve(names.size());
    std::transform(
-        names.begin(), names.end(), res.begin(),
-        [this](const std::string& name) { return scope_->GetVariable(name); });
+        names.begin(), names.end(), std::back_inserter(res),
+        [this](const std::string& name) { return scope_.FindVar(name); });
+    return res;
+  }
+
+  template <typename T>
+  const T* Input(const size_t index) const {
+    auto var = InputVar(index);
+    PADDLE_ENFORCE(var != nullptr, "Input(%d) should not be nullptr", index);
+    return &var->Get<T>();
+  }
+
+  template <typename T>
+  T* Output(const size_t index) const {
+    auto var = OutputVar(index);
+    PADDLE_ENFORCE(var != nullptr, "Output(%d) should not be nullptr", index);
+    return var->GetMutable<T>();
+  }
+
+  template <typename T>
+  const T* Input(const std::string& name) const {
+    auto var = InputVar(name);
+    PADDLE_ENFORCE(var != nullptr, "Input(%s) should not be nullptr", name);
+    return &var->Get<T>();
+  }
+
+  template <typename T>
+  T* Output(const std::string& name) const {
+    auto var = OutputVar(name);
+    PADDLE_ENFORCE(var != nullptr, "Output(%s) should not be nullptr", name);
+    return var->GetMutable<T>();
+  }
+
+  template <typename T>
+  const std::vector<const T*> MultiInput(const std::string& name) const {
+    auto names = op_.Inputs(name);
+    std::vector<const T*> res;
+    res.reserve(names.size());
+    std::transform(names.begin(), names.end(), std::back_inserter(res),
+                   [&](const std::string& sub_name) {
+                     auto var = scope_.FindVar(sub_name);
+                     PADDLE_ENFORCE(var != nullptr,
+                                    "MultiInput(%s:%s) should not be nullptr",
+                                    name, sub_name);
+                     return &var->Get<T>();
+                   });
+    return res;
+  }
+
+  template <typename T>
+  std::vector<const T*> MultiOutput(const std::string& name) const {
+    auto names = op_.Outputs(name);
+    std::vector<const T*> res;
+    res.reserve(names.size());
+    std::transform(names.begin(), names.end(), std::back_inserter(res),
+                   [&](const std::string& sub_name) {
+                     auto var = scope_.FindVar(sub_name);
+                     PADDLE_ENFORCE(var != nullptr,
+                                    "MultiOutput(%s:%s) should not be nullptr",
+                                    name, sub_name);
+                     return var->GetMutable<T>();
+                   });
    return res;
  }

+  const OperatorBase& op_;
+  const Scope& scope_;
+};
+
+class InferShapeContext : public OperatorContext {
+ public:
+  InferShapeContext(const OperatorBase* op, const Scope& scope)
+      : OperatorContext(op, scope) {}
+};
+
+template <typename T>
+struct EigenDeviceConverter;
+
+template <>
+struct EigenDeviceConverter<platform::CPUPlace> {
+  using EigenDeviceType = Eigen::DefaultDevice;
+};
+
+#ifndef PADDLE_ONLY_CPU
+template <>
+struct EigenDeviceConverter<platform::GPUPlace> {
+  using EigenDeviceType = Eigen::GpuDevice;
+};
+#endif
+
+class ExecutionContext : public OperatorContext {
+ public:
+  ExecutionContext(const OperatorBase* op, const Scope& scope,
+                   const platform::DeviceContext& device_context)
+      : OperatorContext(op, scope), device_context_(device_context) {}
+
  template <typename PlaceType,
            typename DeviceType =
                typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
@@ -159,38 +257,23 @@ class KernelContext {

  platform::Place GetPlace() const { return device_context_.GetPlace(); }

-  const OperatorBase& op_;
-  const std::shared_ptr<Scope>& scope_;
  const platform::DeviceContext& device_context_;
 };

 class OpKernel {
 public:
  /**
-   * KernelContext is the only parameter of Kernel Run function.
+   * ExecutionContext is the only parameter of Kernel Run function.
   * Run will get input/output variables, state such as momentum and
   * device resource such as CUDA stream, cublas handle, etc. from
-   * KernelContext. User should construct it before run the Operator.
+   * ExecutionContext. User should construct it before run the Operator.
   */

-  virtual void Compute(const KernelContext& context) const = 0;
+  virtual void Compute(const ExecutionContext& context) const = 0;

  virtual ~OpKernel() {}
 };

-template <typename T>
-struct VarToTensor {};
-
-template <>
-struct VarToTensor<Tensor*> {
-  Tensor* operator()(Variable* var) { return var->GetMutable<Tensor>(); }
-};
-
-template <>
-struct VarToTensor<const Tensor*> {
-  const Tensor* operator()(Variable* var) { return &var->Get<Tensor>(); }
-};
-
 class OperatorWithKernel : public OperatorBase {
 public:
  struct OpKernelKey {
@@ -216,10 +299,14 @@ class OperatorWithKernel : public OperatorBase {
  using OpKernelMap =
      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;

-  void Run(const std::shared_ptr<Scope>& scope,
+  void InferShape(const Scope& scope) const {
+    InferShape(InferShapeContext(this, scope));
+  }
+
+  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const final {
    auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
-    opKernel->Compute(KernelContext(this, scope, dev_ctx));
+    opKernel->Compute(ExecutionContext(this, scope, dev_ctx));
  }

  static std::unordered_map<std::string /* op_type */, OpKernelMap>&
@@ -228,34 +315,8 @@ class OperatorWithKernel : public OperatorBase {
    return g_all_op_kernels;
  }

-  void InferShape(const std::shared_ptr<Scope>& scope) const final {
-    std::vector<const Tensor*> ins;
-    VarNamesToTensors(scope, inputs_, &ins);
-    std::vector<Tensor*> outs;
-    VarNamesToTensors(scope, outputs_, &outs);
-    InferShape(ins, outs);
-  };
-
- private:
-  template <typename T>
-  void VarNamesToTensors(const std::shared_ptr<Scope>& scope,
-                         const std::vector<std::string>& var_names,
-                         std::vector<T>* container) const {
-    container->reserve(var_names.size());
-    VarToTensor<T> convert;
-    for (auto& name : var_names) {
-      auto var = scope->GetVariable(name);
-      if (var != nullptr) {
-        container->push_back(convert(var));
-      } else {
-        container->push_back(nullptr);
-      }
-    }
-  }
-
 protected:
-  virtual void InferShape(const std::vector<const Tensor*>& inputs,
-                          const std::vector<Tensor*>& outputs) const = 0;
+  virtual void InferShape(const InferShapeContext& ctx) const = 0;
 };

 }  // namespace framework

--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -24,15 +24,15 @@ static int op_run_num = 0;
 class OpWithoutKernelTest : public OperatorBase {
 public:
  void Init() override { x = 1; }
-  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
-  void Run(const std::shared_ptr<Scope>& scope,
+  void InferShape(const Scope& scope) const override {}
+  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {
    op_run_num++;
    ASSERT_EQ((int)inputs_.size(), 1);
    ASSERT_EQ((int)outputs_.size(), 1);
-    ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr);
+    ASSERT_EQ(scope.FindVar(inputs_[0]), nullptr);
    ASSERT_EQ(x, 1);
-    ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr);
+    ASSERT_NE(scope.FindVar(outputs_[0]), nullptr);
  }

 public:
@@ -68,11 +68,12 @@ TEST(OperatorBase, all) {
  attr->set_f(3.14);

  paddle::platform::CPUDeviceContext device_context;
-  auto scope = std::make_shared<paddle::framework::Scope>();
+  paddle::framework::Scope scope;

  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
-  scope->CreateVariable("OUT1");
+  scope.NewVar("OUT1");
  ASSERT_EQ(paddle::framework::op_run_num, 0);
+  op->InferShape(scope);
  op->Run(scope, device_context);
  ASSERT_EQ(paddle::framework::op_run_num, 1);
 }
@@ -97,14 +98,13 @@ static int cpu_kernel_run_num = 0;

 class OpWithKernelTest : public OperatorWithKernel {
 protected:
-  void InferShape(const std::vector<const Tensor*>& inputs,
-                  const std::vector<Tensor*>& outputs) const override {}
+  void InferShape(const framework::InferShapeContext& ctx) const override {}
 };

 template <typename T1, typename T2>
 class CPUKernelTest : public OpKernel {
 public:
-  void Compute(const KernelContext& ctx) const {
+  void Compute(const ExecutionContext& ctx) const {
    std::cout << "this is cpu kernel" << std::endl;
    std::cout << ctx.op_.DebugString() << std::endl;
    cpu_kernel_run_num++;
@@ -117,12 +117,12 @@ class CPUKernelTest : public OpKernel {
 class OperatorMultiInputsTest : public OperatorBase {
 public:
  void Init() override { x = 1; }
-  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
-  void Run(const std::shared_ptr<Scope>& scope,
+  void InferShape(const Scope& scope) const override {}
+  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {
-    ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr);
+    ASSERT_EQ(scope.FindVar(inputs_[0]), nullptr);
    ASSERT_EQ(x, 1);
-    ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr);
+    ASSERT_NE(scope.FindVar(outputs_[0]), nullptr);
    ASSERT_EQ(Input("x"), "IN1");
    ASSERT_EQ(Input("y"), "OUT1");
  }
@@ -137,9 +137,9 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker
  OpKernelTestMultiInputsProtoAndCheckerMaker(OpProto* proto,
                                              OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInputs("xs", "inputs of test op");
+    AddInput("xs", "inputs of test op").SetMultiple();
    AddInput("k", "input of test op");
-    AddOutputs("ys", "outputs of test op");
+    AddOutput("ys", "outputs of test op").SetMultiple();
    AddAttr<float>("scale", "scale of cosine op")
        .SetDefault(1.0)
        .LargerThan(0.0);
@@ -149,13 +149,31 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker

 class CPUKernalMultiInputsTest : public OpKernel {
 public:
-  void Compute(const KernelContext& ctx) const {
+  void Compute(const ExecutionContext& ctx) const {
    auto xs = ctx.op_.Inputs("xs");
    ASSERT_EQ(xs.size(), 3UL);
    ASSERT_EQ(xs[0], "x0");
    ASSERT_EQ(xs[1], "x1");
    ASSERT_EQ(xs[2], "x2");

+    auto inVar0 = ctx.MultiInputVar("xs");
+    ASSERT_EQ(inVar0.size(), 3);
+
+    auto intVar1 = ctx.InputVar("k");
+    ASSERT_NE(intVar1, nullptr);
+
+    auto outVar0 = ctx.MultiOutputVar("ys");
+    ASSERT_EQ(outVar0.size(), 2);
+
+    auto inTensor0 = ctx.MultiInput<Tensor>("xs");
+    ASSERT_EQ(inTensor0.size(), 3);
+
+    auto intTensor1 = ctx.Input<Tensor>("k");
+    ASSERT_NE(intTensor1, nullptr);
+
+    auto outTensor0 = ctx.MultiOutput<Tensor>("ys");
+    ASSERT_EQ(outTensor0.size(), 2);
+
    auto k = ctx.op_.Input("k");
    ASSERT_EQ(k, "k0");

@@ -186,7 +204,7 @@ TEST(OpKernel, all) {
  attr->set_f(3.14);

  paddle::platform::CPUDeviceContext cpu_device_context;
-  auto scope = std::make_shared<paddle::framework::Scope>();
+  paddle::framework::Scope scope;

  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
@@ -232,7 +250,13 @@ TEST(OpKernel, multi_inputs) {
  output_format->Add(2);  // y1

  paddle::platform::CPUDeviceContext cpu_device_context;
-  auto scope = std::make_shared<Scope>();
+  paddle::framework::Scope scope;
+  scope.NewVar("x0")->GetMutable<Tensor>();
+  scope.NewVar("x1")->GetMutable<Tensor>();
+  scope.NewVar("x2")->GetMutable<Tensor>();
+  scope.NewVar("k0")->GetMutable<Tensor>();
+  scope.NewVar("y0")->GetMutable<Tensor>();
+  scope.NewVar("y1")->GetMutable<Tensor>();

  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
  op->Run(scope, cpu_device_context);

--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/scope.h"
+#include "paddle/string/printf.h"
+
+namespace paddle {
+namespace framework {
+
+Scope::~Scope() {
+  DropKids();
+  for (auto& kv : vars_) delete kv.second;
+}
+
+Scope& Scope::NewScope() const {
+  kids_.push_back(new Scope(this));
+  return *kids_.back();
+}
+
+Variable* Scope::NewVar(const std::string& name) {
+  auto iter = vars_.find(name);
+  if (iter != vars_.end()) {
+    return iter->second;
+  }
+  Variable* v = new Variable();
+  vars_[name] = v;
+  v->name_ = &(vars_.find(name)->first);
+  return v;
+}
+
+Variable* Scope::NewVar() {
+  return NewVar(string::Sprintf("%p.%d", this, vars_.size()));
+}
+
+Variable* Scope::FindVar(const std::string& name) const {
+  auto it = vars_.find(name);
+  if (it != vars_.end()) return it->second;
+  return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
+}
+
+const Scope* Scope::FindScope(const Variable* var) const {
+  for (auto& kv : vars_) {
+    if (kv.second == var) {
+      return this;
+    }
+  }
+  return (parent_ == nullptr) ? nullptr : parent_->FindScope(var);
+}
+void Scope::DropKids() {
+  for (Scope* s : kids_) delete s;
+  kids_.clear();
+}
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -14,9 +14,9 @@ limitations under the License. */

 #pragma once

+#include <list>
 #include <string>
 #include <unordered_map>
-#include <vector>

 #include "paddle/framework/variable.h"

@@ -35,73 +35,42 @@ class Scope;
 */
 class Scope {
 public:
-  /**
-   * @brief Initialize s Scope without parent.
-   */
  Scope() {}
+  ~Scope();

-  /**
-   * @brief Initialize a Scope with parent.
-   */
-  explicit Scope(const std::shared_ptr<Scope>& parent) : parent_(parent) {}
-
-  /**
-   * @brief Create Variable
-   *
-   * Create Variable in this Scope. Return the exist one if Variable already
-   * been created.
-   */
-  Variable* CreateVariable(const std::string& name) {
-    auto var = GetVariable(name);
-    if (var) {
-      return var;
-    } else {
-      auto ptr = new Variable();
-      name_to_var_[name] = std::unique_ptr<Variable>(ptr);
-      var_to_name_[ptr] = name;
-      return GetVariable(name);
-    }
-  }
-
-  /**
-   * @brief Get Variable.
-   *
-   * Get Variable from this Scope, this function will recursive find Variable
-   * from it's parent scope. Return nullptr if not found.
-   */
-  Variable* GetVariable(const std::string& name) const {
-    auto it = name_to_var_.find(name);
-    if (it != name_to_var_.end()) {
-      return it->second.get();
-    } else if (parent_ != nullptr) {
-      return parent_->GetVariable(name);
-    } else {
-      return nullptr;
-    }
-  }
-
-  /**
-   * @brief If this scope has a Var named name.
-   *
-   * Find if there is a Variable in this scope and it's parent scope
-   */
-  bool HasVariable(const std::string& name) const {
-    return (name_to_var_.find(name) != name_to_var_.end() ||
-            (parent_ && parent_->HasVariable(name)));
-  }
-
-  std::string GetVariableName(Variable* const var) const {
-    try {
-      return var_to_name_.at(var);
-    } catch (...) {
-      return "";
-    }
-  }
+  // Disable Copy, Assign, Move.
+  Scope(const Scope& other) = delete;
+  Scope& operator=(const Scope& other) = delete;
+  Scope(Scope&& other) = delete;
+
+  /// Create a sub-scope. Returns a reference other than a pointer so
+  /// to prevent from manual deletion.
+  /// Mark it to const because that new kid scope cannot change parent scope.
+  Scope& NewScope() const;
+
+  /// Create a variable with given name if it doesn't exist.
+  Variable* NewVar(const std::string& name);
+
+  /// Create a variable with a scope-unique name.
+  Variable* NewVar();
+
+  /// Find a variable in the scope or any of its ancestors.  Returns
+  /// nullptr if cannot find.
+  Variable* FindVar(const std::string& name) const;
+
+  /// Find the scope or an ancestor scope that contains the given variable.
+  const Scope* FindScope(const Variable* var) const;
+
+  /// Drop all kids scopes belonged to this scope.
+  void DropKids();

 private:
-  std::unordered_map<Variable*, std::string> var_to_name_;
-  std::unordered_map<std::string, std::unique_ptr<Variable>> name_to_var_;
-  std::shared_ptr<Scope> parent_{nullptr};
+  // Call Scope::NewScope for a sub-scope.
+  explicit Scope(Scope const* parent) : parent_(parent) {}
+
+  std::unordered_map<std::string, Variable*> vars_;
+  mutable std::list<Scope*> kids_;
+  Scope const* parent_{nullptr};
 };

 }  // namespace framework

--- a/paddle/framework/scope_test.cc
+++ b/paddle/framework/scope_test.cc
@@ -15,49 +15,42 @@ limitations under the License. */
 #include "paddle/framework/scope.h"
 #include "gtest/gtest.h"

-TEST(Scope, Create) {
-  using paddle::framework::Scope;
-  using paddle::framework::Variable;
+using paddle::framework::Scope;
+using paddle::framework::Variable;

-  auto scope = std::make_shared<Scope>();
+TEST(Scope, VarsShadowing) {
+  Scope s;
+  Scope& ss1 = s.NewScope();
+  Scope& ss2 = s.NewScope();

-  Variable* var0 = scope->CreateVariable("");
-  EXPECT_NE(var0, nullptr);
+  Variable* v0 = s.NewVar("a");
+  Variable* v1 = ss1.NewVar("a");

-  /// GetVariable will return nullptr if not exist.
-  Variable* var1 = scope->GetVariable("a");
-  EXPECT_EQ(var1, nullptr);
+  EXPECT_NE(v0, v1);

-  /// CreateVariable will return one.
-  Variable* var2 = scope->CreateVariable("a");
-  EXPECT_NE(var2, nullptr);
-
-  /// Get the created variable.
-  Variable* var3 = scope->GetVariable("a");
-  EXPECT_EQ(var2, var3);
+  EXPECT_EQ(v0, s.FindVar("a"));
+  EXPECT_EQ(v1, ss1.FindVar("a"));
+  EXPECT_EQ(v0, ss2.FindVar("a"));
+}

-  /// CreateVariable will just return the variable if it's
-  /// already exist.
-  Variable* var4 = scope->CreateVariable("a");
-  EXPECT_EQ(var4, var2);
+TEST(Scope, FindVar) {
+  Scope s;
+  Scope& ss = s.NewScope();

-  EXPECT_EQ("a", scope->GetVariableName(var4));
-  Scope scope2;
-  auto var = scope2.CreateVariable("tmp");
-  EXPECT_EQ("", scope->GetVariableName(var));
-}
+  EXPECT_EQ(nullptr, s.FindVar("a"));
+  EXPECT_EQ(nullptr, ss.FindVar("a"));

-TEST(Scope, Parent) {
-  using paddle::framework::Scope;
-  using paddle::framework::Variable;
+  ss.NewVar("a");

-  auto parent_scope = std::make_shared<Scope>();
-  auto scope = std::make_shared<Scope>(parent_scope);
+  EXPECT_EQ(nullptr, s.FindVar("a"));
+  EXPECT_NE(nullptr, ss.FindVar("a"));
+}

-  Variable* var0 = parent_scope->CreateVariable("a");
-  EXPECT_NE(var0, nullptr);
+TEST(Scope, FindScope) {
+  Scope s;
+  Scope& ss = s.NewScope();
+  Variable* v = s.NewVar("a");

-  /// GetVariable will get Variable from parent scope if exist.
-  Variable* var1 = scope->GetVariable("a");
-  EXPECT_EQ(var0, var1);
+  EXPECT_EQ(&s, s.FindScope(v));
+  EXPECT_EQ(&s, ss.FindScope(v));
 }
--- a/paddle/framework/variable.h
+++ b/paddle/framework/variable.h
@@ -16,7 +16,7 @@
 #include <typeindex>
 #include <typeinfo>

-#include "paddle/platform/assert.h"
+#include "paddle/platform/enforce.h"

 namespace paddle {
 namespace framework {
@@ -25,7 +25,7 @@ class Variable {
 public:
  template <typename T>
  const T& Get() const {
-    PADDLE_ASSERT(IsType<T>());
+    PADDLE_ENFORCE(IsType<T>(), "Variable must be type %s", typeid(T).name());
    return *static_cast<const T*>(holder_->Ptr());
  }

@@ -65,6 +65,17 @@ class Variable {

  std::unique_ptr<Placeholder>
      holder_;  // pointers to a PlaceholderImpl object indeed.
+
+  // name_ is only meaningful with a Scope and accessible by it.
+  //
+  // NOTE: Please don't expose name_ by adding methods like
+  // Variable::Name or Scope::VarName!  A variable could have a human
+  // readable name or an auto-generated scope-unique name.  In the
+  // former case, the caller knows the name and doesn't need to access
+  // the name; in the latter case, the variable should be identified
+  // by its address but not the unreadable name.
+  friend class Scope;
+  const std::string* name_;
 };

 }  // namespace framework

--- a/paddle/gserver/layers/SliceProjection.cpp
+++ b/paddle/gserver/layers/SliceProjection.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Projection.h"
+
+namespace paddle {
+
+/**
+ * SliceProjection can slice the input value into multiple parts,
+ * and then select some of them to merge into a new output.
+ *
+ * First, calculate the slices that need to be merged into the output.
+ * slices = input.slices().for_output()
+ *
+ * Second, merge each slice into the output.
+ * for(auto slice: slices) {
+ *   out.addAtOffset(slice, offset);
+ * }
+ *
+ * Input slices as output: s0, s1, ...:
+ *   -----------------------
+ *   |///|   |//////|      |
+ *   |/s0|   |//s1//|      |
+ *   |///|   |//////|      |
+ *   -----------------------
+ * Output, merge s0, s1, ... into one output:
+ *   ----------------
+ *   |///|//////|   |
+ *   |/s0|//s1//|...|
+ *   |///|//////|   |
+ *   ----------------
+ *
+ * The config file api is slice_projection.
+ */
+class SliceProjection : public Projection {
+public:
+  SliceProjection(const ProjectionConfig& config,
+                  const ParameterPtr& parameter,
+                  bool useGpu);
+  virtual void forward();
+  virtual void backward(const UpdateCallback& callback);
+
+protected:
+  std::vector<std::pair<size_t, size_t>> slices_;
+};
+
+REGISTER_PROJECTION(slice, SliceProjection);
+
+/**
+ * Constructed function.
+ * @note SliceProjection should not have any parameter.
+ */
+SliceProjection::SliceProjection(const ProjectionConfig& config,
+                                 const ParameterPtr& parameter,
+                                 bool useGpu)
+    : Projection(config, parameter, useGpu) {
+  CHECK(!parameter) << "'slice' projection should not have any parameter";
+
+  slices_.reserve(config.slices_size());
+  for (const auto& slice : config.slices()) {
+    slices_.push_back(std::make_pair(slice.start(), slice.end()));
+  }
+}
+
+void SliceProjection::forward() {
+  size_t offset = 0;
+  for (auto& slice : slices_) {
+    auto slice_out = in_->value->subColMatrix(slice.first, slice.second);
+    out_->value->addAtOffset(*slice_out, offset);
+    offset += slice_out->getWidth();
+  }
+}
+
+void SliceProjection::backward(const UpdateCallback& callback) {
+  if (in_->grad) {
+    size_t offset = 0;
+    for (auto& slice : slices_) {
+      auto slice_out = in_->grad->subColMatrix(slice.first, slice.second);
+      slice_out->addAtOffset(*out_->grad, offset);
+      offset += slice_out->getWidth();
+    }
+  }
+}
+
+}  // namespace paddle
--- a/paddle/gserver/tests/concat_slice_a.conf
+++ b/paddle/gserver/tests/concat_slice_a.conf
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=10)
+
+data = data_layer(name ="input", size=8*16*16)
+
+conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                       num_channels=8,
+                       num_filters=16, stride=1,
+                       bias_attr=False,
+                       act=ReluActivation())
+conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                       num_channels=8,
+                       num_filters=16, stride=1,
+                       bias_attr=False,
+                       act=ReluActivation())
+
+proj1 = slice_projection(input=conv1, slices=[(0, 4), (4, 12)])
+
+proj2 = slice_projection(input=conv2, slices=[(1, 5), (5, 15)])
+
+concat = concat_layer(input=[proj1, proj2])
+
+outputs(concat)
+
--- a/paddle/gserver/tests/concat_slice_b.conf
+++ b/paddle/gserver/tests/concat_slice_b.conf
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=10)
+
+data = data_layer(name ="input", size=8*16*16)
+
+conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                       num_channels=8,
+                       num_filters=16, stride=1,
+                       bias_attr=False,
+                       act=ReluActivation())
+conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                       num_channels=8,
+                       num_filters=16, stride=1,
+                       bias_attr=False,
+                       act=ReluActivation())
+
+proj1 = slice_projection(input=conv1, slices=[(0, 12)])
+
+proj2 = slice_projection(input=conv2, slices=[(1, 15)])
+
+concat = concat_layer(input=[proj1, proj2])
+
+outputs(concat)
+
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -152,6 +152,26 @@ TEST(Projection, identity) {
  }
 }

+TEST(Projection, slice) {
+  ProjectionConfig conf;
+  conf.set_type("slice");
+  conf.set_input_size(100);
+  SliceConfig& slice1 = *conf.add_slices();
+  slice1.set_start(10);
+  slice1.set_end(20);
+  SliceConfig& slice2 = *conf.add_slices();
+  slice2.set_start(50);
+  slice2.set_end(70);
+  conf.set_output_size(30);
+  for (auto useGpu : {false, true}) {
+    testProjectionGrad(conf,
+                       INPUT_DATA,
+                       /* parameterSize */ 0,
+                       /* batchSize */ 10,
+                       useGpu);
+  }
+}
+
 TEST(Projection, scaling) {
  ProjectionConfig conf;
  conf.set_type("scaling");

--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
@@ -237,6 +237,12 @@ TEST(Compare, concat_table) {
  compareNetwork(config_file_a, config_file_b);
 }

+TEST(Compare, concat_slice) {
+  std::string config_file_a = "./gserver/tests/concat_slice_a.conf";
+  std::string config_file_b = "./gserver/tests/concat_slice_b.conf";
+  compareNetwork(config_file_a, config_file_b);
+}
+
 #ifndef PADDLE_ONLY_CPU
 TEST(Compare, img_pool) {
  std::string config_file_a = "./gserver/tests/img_pool_a.conf";

--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -1141,4 +1141,64 @@ TEST(CpuMatrix, copyFrom) {
  TensorCheckEqual(cpu, copy);
 }

+void testBatch2seqPadding(int batchSize, int inputDim) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
+
+  IVectorPtr cpuSequence;
+  generateSequenceStartPositions(batchSize, cpuSequence);
+  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
+  gpuSequence->copyFrom(*cpuSequence);
+
+  size_t numSeq = cpuSequence->getSize() - 1;
+  size_t maxSeqLen = *std::max_element(cpuSequence->getData(),
+                                       cpuSequence->getData() + numSeq);
+
+  MatrixPtr cBatch = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim);
+  MatrixPtr gBatch = std::make_shared<GpuMatrix>(numSeq * maxSeqLen, inputDim);
+  MatrixPtr cCheck = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim);
+
+  hl_sequence2batch_copy_padding(gBatch->getData(),
+                                 gpuInput->getData(),
+                                 cpuSequence->getData(),
+                                 inputDim,
+                                 maxSeqLen,
+                                 numSeq,
+                                 false,
+                                 true);
+  cCheck->copyFrom(*gBatch);
+
+  int* seqStart = cpuSequence->getData();
+  float* batchData = cBatch->getData();
+  float* seqData = cpuInput->getData();
+  for (size_t i = 0; i < maxSeqLen; i++) {
+    for (size_t j = 0; j < numSeq; j++) {
+      size_t sequenceStart = seqStart[j];
+      size_t sequenceLength = seqStart[j + 1] - seqStart[j];
+      if (i < sequenceLength) {
+        memcpy(batchData + (i * numSeq + j) * inputDim,
+               seqData + (sequenceStart + i) * inputDim,
+               inputDim * sizeof(real));
+      } else {
+        memset(batchData + (i * numSeq + j) * inputDim,
+               0,
+               inputDim * sizeof(real));
+      }
+    }
+  }
+
+  TensorCheckErr(*cBatch, *cCheck);
+}
+
+TEST(Matrix, warpCTC) {
+  for (auto batchSize : {51, 526, 2884}) {
+    for (auto inputDim : {32, 512, 2026}) {
+      VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim;
+      testBatch2seqPadding(batchSize, inputDim);
+    }
+  }
+}
+
 #endif
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -44,18 +44,26 @@ endfunction()
 op_library(add_op SRCS add_op.cc add_op.cu)
 cc_test(add_op_test SRCS add_op_test.cc DEPS add_op)

+op_library(mean_op SRCS mean_op.cc mean_op.cu)
+cc_test(mean_op_test SRCS mean_op_test.cc DEPS mean_op)
+
 op_library(mul_op SRCS mul_op.cc mul_op.cu)
 op_library(rowwise_add_op SRCS rowwise_add_op.cu rowwise_add_op.cc)
-op_library(sigmoid_op SRCS sigmoid_op.cu sigmoid_op.cc)
+
+op_library(sigmoid_op SRCS sigmoid_op.cc sigmoid_op.cu)
 op_library(softmax_op SRCS softmax_op.cc softmax_op.cu)
 op_library(cross_entropy_op SRCS cross_entropy_op.cc cross_entropy_op.cu)
-
-op_library(fc_op SRCS fc_op.cc DEPS mul_op rowwise_add_op sigmoid_op
-        softmax_op net)
+op_library(fill_zeros_like_op SRCS fill_zeros_like_op.cc fill_zeros_like_op.cu)

 op_library(sgd_op SRCS sgd_op.cc sgd_op.cu)

-op_library(recurrent_network_op SRCS recurrent_network_op.cc DEPS op_desc
-tensor op_registry operator net)
-cc_test(recurrent_network_op_test SRCS recurrent_network_op_test.cc DEPS
-recurrent_network_op gtest mul_op add_op)
+op_library(fc_op
+    SRCS fc_op.cc
+    DEPS mul_op rowwise_add_op sigmoid_op softmax_op net)
+
+op_library(recurrent_network_op
+    SRCS recurrent_network_op.cc
+    DEPS op_desc tensor net)
+cc_test(recurrent_network_op_test
+    SRCS recurrent_network_op_test.cc
+    DEPS recurrent_network_op mul_op add_op)
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
@@ -19,16 +19,16 @@ namespace operators {

 class AddOp : public OperatorWithKernel {
 protected:
-  void InferShape(const std::vector<const Tensor *> &inputs,
-                  const std::vector<Tensor *> &outputs) const override {
-    PADDLE_ENFORCE(inputs.size() == 2, "Input size of AddOp must be two");
-    PADDLE_ENFORCE(outputs.size() == 1, "Output size of AddOp must be one");
-    PADDLE_ENFORCE(
-        inputs[0] != nullptr && inputs[1] != nullptr && outputs[0] != nullptr,
-        "Inputs/Outputs of AddOp must all be set");
-    PADDLE_ENFORCE(inputs[0]->dims() == inputs[1]->dims(),
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 2, "Input size of AddOp must be two");
+    PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of AddOp must be one");
+    PADDLE_ENFORCE(ctx.InputVar(0) != nullptr && ctx.InputVar(1) != nullptr,
+                   "Inputs of AddOp must all be set");
+    PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr,
+                   "Outputs of AddOp must all be set");
+    PADDLE_ENFORCE(ctx.Input<Tensor>(0)->dims() == ctx.Input<Tensor>(1)->dims(),
                   "Two input of Add Op's dimension must be same.");
-    outputs[0]->Resize(inputs[0]->dims());
+    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
  }
 };

@@ -49,8 +49,7 @@ The equation is: Out = X + Y

 class AddOpGrad : public OperatorWithKernel {
 protected:
-  void InferShape(const std::vector<const Tensor *> &inputs,
-                  const std::vector<Tensor *> &outputs) const override {}
+  void InferShape(const InferShapeContext &ctx) const override {}
  std::string DebugString() const override {
    LOG(INFO) << "AddOpGrad";
    return "";

--- a/paddle/operators/add_op.h
+++ b/paddle/operators/add_op.h
@@ -21,16 +21,17 @@ namespace operators {
 template <typename Place, typename T>
 class AddKernel : public OpKernel {
 public:
-  void Compute(const KernelContext& context) const override {
-    auto input0 = context.Input(0)->Get<Tensor>();
-    auto input1 = context.Input(1)->Get<Tensor>();
-    auto output = context.Output(0)->GetMutable<Tensor>();
+  void Compute(const ExecutionContext& context) const override {
+    auto input0 = context.Input<Tensor>(0);
+    auto input1 = context.Input<Tensor>(1);
+    auto output = context.Output<Tensor>(0);

    output->mutable_data<T>(context.GetPlace());

    EigenVector<T>::Flatten(*output).device(
        *(context.GetEigenDevice<Place>())) =
-        EigenVector<T>::Flatten(input0) + EigenVector<T>::Flatten(input1);
+        framework::EigenVector<T>::Flatten(*input0) +
+        framework::EigenVector<T>::Flatten(*input1);
  }
 };


--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -19,20 +19,20 @@ namespace operators {

 class OnehotCrossEntropyOp : public OperatorWithKernel {
 protected:
-  void InferShape(const std::vector<const Tensor *> &inputs,
-                  const std::vector<Tensor *> &outputs) const override {
-    PADDLE_ENFORCE(inputs.size() == 2,
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 2,
                   "Input size of OnehotCrossEntropyOp must be two");
-    PADDLE_ENFORCE(outputs.size() == 1,
+    PADDLE_ENFORCE(ctx.OutputSize() == 1,
                   "Output size of OnehotCrossEntropyOp must be one");
-    PADDLE_ENFORCE(inputs[0] != nullptr && inputs[1] != nullptr,
+    PADDLE_ENFORCE(ctx.InputVar(0) != nullptr && ctx.InputVar(1) != nullptr,
                   "Inputs of OnehotCrossEntropyOp must all be set");
-    PADDLE_ENFORCE(outputs[0] != nullptr,
+    PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr,
                   "Outputs of OnehotCrossEntropyOp must all be set");
-    PADDLE_ENFORCE(inputs[0]->dims().size() == 2, "X's dimension must be 2.");
-    PADDLE_ENFORCE(outputs[0]->dims().size() == 1,
+    PADDLE_ENFORCE(ctx.Input<Tensor>(0)->dims().size() == 2,
+                   "X's dimension must be 2.");
+    PADDLE_ENFORCE(ctx.Output<Tensor>(0)->dims().size() == 1,
                   "label's dimension must be 1.");
-    outputs[0]->Resize({inputs[0]->dims()[0]});
+    ctx.Output<Tensor>(0)->Resize({ctx.Input<Tensor>(0)->dims()[0]});
  }
 };


--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
@@ -23,18 +23,18 @@ class OnehotCrossEntropyOpKernel : public OpKernel {
 public:
  constexpr T LOG_THRESHOLD() const { return static_cast<T>(1e-20); }

-  void Compute(const KernelContext& context) const override {
-    auto X = context.Input(0)->Get<Tensor>();
-    const T* X_data = X.data<T>();
-    const int* label_data = context.Input(1)->Get<Tensor>().data<int>();
-    auto* Y = context.Output(0)->GetMutable<Tensor>();
+  void Compute(const ExecutionContext& ctx) const override {
+    auto X = ctx.Input<Tensor>(0);
+    const T* X_data = X->data<T>();
+    const int* label_data = ctx.Input<Tensor>(1)->data<int>();
+    auto Y = ctx.Output<Tensor>(0);

-    Y->mutable_data<T>(context.GetPlace());
+    Y->mutable_data<T>(ctx.GetPlace());

    T* Y_data = Y->data<T>();

-    int batch_size = X.dims()[0];
-    int class_num = X.dims()[1];
+    int batch_size = X->dims()[0];
+    int class_num = X->dims()[1];

    // Y[i] = -log(X[i][j])
    for (int i = 0; i < batch_size; ++i) {

--- a/paddle/operators/fc_op.cc
+++ b/paddle/operators/fc_op.cc
@@ -50,8 +50,8 @@ public:
    AddInput("b", "the bias of fc operator");

    AddOutput("Y", "the output of fc operator");
-    AddOutput(
-        "before_act", "the before activation output of fc operator", true);
+    AddOutput("before_act", "the before activation output of fc operator")
+        .SetTemporary();
    AddAttr<std::string>("activation", "The activation key for fc layer")
        .SetDefault("sigmoid")
        .InEnum({"sigmoid", "softmax"});

--- a/paddle/operators/fill_zeros_like_op.cc
+++ b/paddle/operators/fill_zeros_like_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/fill_zeros_like_op.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+
+class FillZerosLikeOp : public framework::OperatorWithKernel {
+protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 1UL,
+                   "Input size of FillZerosLikeOp must be one.");
+    PADDLE_ENFORCE(ctx.OutputSize() == 1UL,
+                   "Output size of AddOp must be one.");
+    PADDLE_ENFORCE(ctx.InputVar(0) != nullptr,
+                   "Input of FillZerosLikeOp must be set.");
+    PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr,
+                   "Output of FillZerosLikeOp must be set.");
+    ctx.Output<framework::Tensor>(0)->Resize(
+        ctx.Input<framework::Tensor>(0)->dims());
+  }
+};
+
+class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  FillZerosLikeOpMaker(framework::OpProto *proto,
+                       framework::OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Src", "The input of fill-zeros-like op.");
+    AddOutput("Dst", "The varibale will be filled up with zeros.");
+    AddComment(R"DOC(
+Fill up a vriable with zeros.
+
+The output will have the same size with input.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(fill_zeros_like,
+            paddle::operators::FillZerosLikeOp,
+            paddle::operators::FillZerosLikeOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    fill_zeros_like,
+    paddle::operators::FillZerosLikeKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/fill_zeros_like_op.cu
+++ b/paddle/operators/fill_zeros_like_op.cu
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/fill_zeros_like_op.h"
+
+REGISTER_OP_GPU_KERNEL(
+    fill_zeros_like,
+    paddle::operators::FillZerosLikeKernel<paddle::platform::GPUPlace, float>);
\ No newline at end of file
--- a/paddle/operators/fill_zeros_like_op.h
+++ b/paddle/operators/fill_zeros_like_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "glog/logging.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class FillZerosLikeKernel : public framework::OpKernel {
+public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* output = context.Output<framework::Tensor>(0);
+    output->mutable_data<T>(context.GetPlace());
+    framework::EigenVector<T>::Flatten(*output).setZero();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/mean_op.h"
+
+namespace paddle {
+namespace operators {
+
+class MeanOp : public OperatorWithKernel {
+protected:
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 1, "Input size of AddOp must be one");
+    PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of AddOp must be one");
+    PADDLE_ENFORCE(ctx.InputVar(0) != nullptr && ctx.OutputVar(0) != nullptr,
+                   "Input/Output of MeanOp must be initialized.");
+    ctx.Output<Tensor>(0)->Resize(framework::make_ddim({1}));
+  }
+};
+
+class MeanOpMaker : public OpProtoAndCheckerMaker {
+public:
+  MeanOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input of mean op");
+    AddOutput("Out", "The output of mean op");
+    AddComment("Mean Operator");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(mean, ops::MeanOp, ops::MeanOpMaker);
+REGISTER_OP_CPU_KERNEL(mean, ops::MeanKernel<ops::CPUPlace, float>);
--- a/paddle/operators/mean_op.cu
+++ b/paddle/operators/mean_op.cu
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/mean_op.h"
+
+REGISTER_OP_GPU_KERNEL(mean, ops::MeanKernel<ops::GPUPlace, float>);
--- a/paddle/operators/mean_op.h
+++ b/paddle/operators/mean_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/operators/type_alias.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class MeanKernel : public OpKernel {
+public:
+  void Compute(const ExecutionContext& context) const override {
+    auto input = context.Input<Tensor>(0);
+    auto output = context.Output<Tensor>(0);
+
+    output->mutable_data<T>(context.GetPlace());
+
+    EigenScalar<T>::From(*output).device(*(context.GetEigenDevice<Place>())) =
+        EigenVector<T>::Flatten(*input).mean();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/mean_op_test.cc
+++ b/paddle/operators/mean_op_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+
+#include <paddle/framework/op_registry.h>
+
+USE_OP(mean);
+
+TEST(MeanOp, GetOpProto) {
+  auto& protos = paddle::framework::OpRegistry::protos();
+  auto it = protos.find("mean");
+  ASSERT_NE(it, protos.end());
+}
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -19,18 +19,17 @@ namespace operators {

 class MulOp : public OperatorWithKernel {
 protected:
-  void InferShape(const std::vector<const Tensor *> &inputs,
-                  const std::vector<Tensor *> &outputs) const override {
-    PADDLE_ENFORCE(inputs.size() == 2, "The mul op must take two inputs");
-    auto dim0 = inputs[0]->dims();
-    auto dim1 = inputs[1]->dims();
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 2, "The mul op must take two inputs");
+    auto dim0 = ctx.Input<Tensor>(0)->dims();
+    auto dim1 = ctx.Input<Tensor>(1)->dims();
    PADDLE_ENFORCE(dim0.size() == 2 && dim1.size() == 2,
                   "The input of mul op must be matrix");
    PADDLE_ENFORCE(
        dim0[1] == dim1[0],
        "First matrix's width must be equal with second matrix's height.");
-    PADDLE_ENFORCE(outputs.size() == 1, "The mul op must take one output");
-    outputs[0]->Resize({dim0[0], dim1[1]});
+    PADDLE_ENFORCE(ctx.OutputSize() == 1, "The mul op must take one output");
+    ctx.Output<Tensor>(0)->Resize({dim0[0], dim1[1]});
  }
 };

@@ -51,8 +50,7 @@ The equation is: Out = X * Y

 class MulOpGrad : public OperatorWithKernel {
 protected:
-  void InferShape(const std::vector<const Tensor *> &inputs,
-                  const std::vector<Tensor *> &outputs) const override {}
+  void InferShape(const InferShapeContext &ctx) const override {}
  std::string DebugString() const override {
    LOG(INFO) << "MulGrad";
    return "";

--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@@ -22,19 +22,17 @@ namespace operators {
 template <typename Place, typename T>
 class MulKernel : public OpKernel {
 public:
-  void Compute(const KernelContext& context) const override {
+  void Compute(const ExecutionContext& context) const override {
    Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair = {
        {Eigen::IndexPair<Eigen::DenseIndex>(1, 0)}};

-    auto input0 = context.Input(0)->Get<Tensor>();
-    auto input1 = context.Input(1)->Get<Tensor>();
-    auto* output = context.Output(0)->GetMutable<Tensor>();
-
+    auto output = context.Output<Tensor>(0);
    output->mutable_data<T>(context.GetPlace());

    EigenMatrix<T>::From(*output).device(*(context.GetEigenDevice<Place>())) =
-        EigenMatrix<T>::From(input0).contract(EigenMatrix<T>::From(input1),
-                                              dim_pair);
+        EigenMatrix<T>::From(*context.Input<Tensor>("X"))
+            .contract(EigenMatrix<T>::From(*context.Input<Tensor>("Y")),
+                      dim_pair);
  }
 };
 }  // namespace operators

--- a/paddle/operators/recurrent_network_op.cc
+++ b/paddle/operators/recurrent_network_op.cc
@@ -27,38 +27,37 @@ namespace operators {

 namespace rnn {

-void SegmentInputs(std::vector<std::shared_ptr<Scope>>& step_scopes,
+void SegmentInputs(const std::vector<Scope*>& step_scopes,
                   const std::vector<Link>& inlinks,
                   const size_t seq_len) {
  PADDLE_ENFORCE(!inlinks.empty(), "no in links are provided.");
  for (size_t i = 0; i < inlinks.size(); ++i) {
    Tensor* input =
-        step_scopes[0]->GetVariable(inlinks[i].external)->GetMutable<Tensor>();
+        step_scopes[0]->FindVar(inlinks[i].external)->GetMutable<Tensor>();
    DDim dims = input->dims();
    PADDLE_ENFORCE(static_cast<size_t>(dims[0]) == seq_len,
                   "all the inlinks must have same length");
    DDim step_dims = slice_ddim(dims, 1, dims.size());
    for (size_t j = 0; j < seq_len; j++) {
-      Tensor* step_input = step_scopes[j]
-                               ->CreateVariable(inlinks[i].internal)
-                               ->GetMutable<Tensor>();
+      Tensor* step_input =
+          step_scopes[j]->NewVar(inlinks[i].internal)->GetMutable<Tensor>();
      *step_input = input->Slice<float>(j, j + 1);
      step_input->Resize(step_dims);
    }
  }
 }

-void ConcatOutputs(std::vector<std::shared_ptr<Scope>>& step_scopes,
+void ConcatOutputs(const std::vector<Scope*>& step_scopes,
                   const std::vector<Link>& outlinks,
                   const size_t seq_len) {
  for (size_t i = 0; i < outlinks.size(); i++) {
    Tensor* output =
-        step_scopes[0]->GetVariable(outlinks[i].external)->GetMutable<Tensor>();
+        step_scopes[0]->FindVar(outlinks[i].external)->GetMutable<Tensor>();

    // TODO(qingiqng) remove following code after adding
    // InferShape in RecurrentGradientOp
    DDim step_dims = step_scopes[0]
-                         ->GetVariable(outlinks[i].internal)
+                         ->FindVar(outlinks[i].internal)
                         ->GetMutable<Tensor>()
                         ->dims();
    std::vector<int> dims_vec = vectorize(step_dims);
@@ -66,9 +65,8 @@ void ConcatOutputs(std::vector<std::shared_ptr<Scope>>& step_scopes,
    output->mutable_data<float>(make_ddim(dims_vec), platform::CPUPlace());

    for (size_t j = 0; j < seq_len; j++) {
-      Tensor* step_output = step_scopes[j]
-                                ->GetVariable(outlinks[i].internal)
-                                ->GetMutable<Tensor>();
+      Tensor* step_output =
+          step_scopes[j]->FindVar(outlinks[i].internal)->GetMutable<Tensor>();
      // TODO(luotao02) data type and platform::DeviceContext() should set
      // correctly
      (output->Slice<float>(j, j + 1))
@@ -77,7 +75,7 @@ void ConcatOutputs(std::vector<std::shared_ptr<Scope>>& step_scopes,
  }
 }

-void LinkMemories(std::vector<std::shared_ptr<Scope>>& scopes,
+void LinkMemories(const std::vector<Scope*>& scopes,
                  const std::vector<rnn::MemoryAttr>& memories,
                  size_t step_id,
                  int offset) {
@@ -94,17 +92,17 @@ void LinkMemories(std::vector<std::shared_ptr<Scope>>& scopes,
                 offset,
                 scopes.size(),
                 step_id);
-  std::shared_ptr<Scope> scope = scopes[step_id];
-  std::shared_ptr<Scope> linked_scope = scopes[step_id + offset];
+  auto scope = scopes[step_id];
+  auto linked_scope = scopes[step_id + offset];
  for (auto& attr : memories) {
-    auto mem = scope->CreateVariable(attr.pre_var)->GetMutable<Tensor>();
+    auto mem = scope->NewVar(attr.pre_var)->GetMutable<Tensor>();
    // maybe share variable is better?
-    auto linked_mem = linked_scope->GetVariable(attr.var)->GetMutable<Tensor>();
+    auto linked_mem = linked_scope->FindVar(attr.var)->GetMutable<Tensor>();
    mem->ShareDataWith<float>(*linked_mem);

    // TODO(qingqing) remove following code
    // the memory of current step should be allocated in step net
-    auto m = scope->CreateVariable(attr.var)->GetMutable<Tensor>();
+    auto m = scope->NewVar(attr.var)->GetMutable<Tensor>();
    // for unit test, as addOp and mulOp are null currently, if not
    // mutable_data, mem.data() in output will be error. We will
    // remove this line after merge the correct addOp and mulOp.
@@ -171,8 +169,8 @@ void InitArgument(const ArgumentName& name,

 }  // namespace rnn

-void RecurrentAlgorithm::InferShape(const std::shared_ptr<Scope>& scope) const {
-  seq_len_ = scope->GetVariable((arg_->inlinks[0]).external)
+void RecurrentAlgorithm::InferShape(const Scope& scope) const {
+  seq_len_ = scope.FindVar((arg_->inlinks[0]).external)
                 ->GetMutable<Tensor>()
                 ->dims()[0];
  CreateScopes(scope);
@@ -187,10 +185,10 @@ void RecurrentAlgorithm::InferShape(const std::shared_ptr<Scope>& scope) const {

  InitMemories(step_scopes[0]);

-  PADDLE_ENFORCE(scope->HasVariable(arg_->step_net),
+  PADDLE_ENFORCE(scope.FindVar(arg_->step_net) != nullptr,
                 "stepnet [%s] is not in scope.",
                 arg_->step_net);
-  Variable* net = scope->GetVariable(arg_->step_net);
+  Variable* net = scope.FindVar(arg_->step_net);
  PADDLE_ENFORCE(net != nullptr, "failed to get step net");
  // If the InferShape is called in OperatorBase's run function,
  // the rnn op only needs to do InferShape for the first time step
@@ -198,82 +196,79 @@ void RecurrentAlgorithm::InferShape(const std::shared_ptr<Scope>& scope) const {
    if (i > 0) {
      rnn::LinkMemories(step_scopes, arg_->memories, i, -1);
    }
-    net->GetMutable<NetOp>()->InferShape(step_scopes[i]);
+    net->GetMutable<NetOp>()->InferShape(*step_scopes[i]);
  }

  auto outlinks = arg_->outlinks;
  for (size_t i = 0; i < outlinks.size(); i++) {
    DDim step_dims = step_scopes[0]
-                         ->GetVariable(outlinks[i].internal)
+                         ->FindVar(outlinks[i].internal)
                         ->GetMutable<Tensor>()
                         ->dims();
    std::vector<int> dims_vec = vectorize(step_dims);
    // now only support fixed length
    dims_vec.insert(dims_vec.begin(), seq_len_);
    Tensor* output =
-        step_scopes[0]->GetVariable(outlinks[i].external)->GetMutable<Tensor>();
+        step_scopes[0]->FindVar(outlinks[i].external)->GetMutable<Tensor>();
    output->Resize(make_ddim(dims_vec));
  }
 }

-void RecurrentAlgorithm::Run(const std::shared_ptr<Scope>& scope,
+void RecurrentAlgorithm::Run(const Scope& scope,
                             const platform::DeviceContext& dev_ctx) const {
  auto step_scopes = GetStepScopes(scope);

-  Variable* net = scope->GetVariable(arg_->step_net);
+  Variable* net = scope.FindVar(arg_->step_net);
  for (size_t step_id = 0; step_id < seq_len_; step_id++) {
    // the link memory is done in InferShape
    // maybe remove following code after testing
    if (step_id > 0) {
      rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1);
    }
-    net->GetMutable<NetOp>()->Run(step_scopes[step_id], dev_ctx);
+    net->GetMutable<NetOp>()->Run(*step_scopes[step_id], dev_ctx);
  }

  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_);
 }

-void RecurrentAlgorithm::CreateScopes(std::shared_ptr<Scope> scope) const {
+void RecurrentAlgorithm::CreateScopes(const Scope& scope) const {
  // TODO(xxx) Only two scopes are needed for inference, this case will be
  // supported later.
-  auto step_scopes = scope->GetVariable(arg_->step_scopes)
-                         ->GetMutable<std::vector<std::shared_ptr<Scope>>>();
+  auto step_scopes =
+      scope.FindVar(arg_->step_scopes)->GetMutable<std::vector<Scope*>>();

  if (seq_len_ > step_scopes->size()) {
    for (size_t i = step_scopes->size(); i < seq_len_; ++i) {
-      std::shared_ptr<Scope> step_scope = std::make_shared<Scope>(scope);
+      auto& step_scope = scope.NewScope();

      // Now all variables in scope must be created outside of op.
-      auto net_op = scope->GetVariable(arg_->step_net)->GetMutable<NetOp>();
+      auto net_op = scope.FindVar(arg_->step_net)->GetMutable<NetOp>();
      for (auto& input : net_op->inputs_) {
-        step_scope->CreateVariable(input);
+        if (!step_scope.FindVar(input)) step_scope.NewVar(input);
      }
      for (auto& output : net_op->outputs_) {
-        step_scope->CreateVariable(output);
+        step_scope.NewVar(output);
      }

-      step_scopes->push_back(std::make_shared<Scope>(step_scope));
+      step_scopes->emplace_back(&step_scope);
    }
  }
 }

-void RecurrentAlgorithm::InitMemories(std::shared_ptr<Scope> step_scope) const {
+void RecurrentAlgorithm::InitMemories(Scope* step_scope) const {
  for (auto& attr : arg_->memories) {
-    Tensor* pre_mem =
-        step_scope->CreateVariable(attr.pre_var)->GetMutable<Tensor>();
-    PADDLE_ENFORCE(step_scope->HasVariable(attr.boot_var),
+    Tensor* pre_mem = step_scope->NewVar(attr.pre_var)->GetMutable<Tensor>();
+    PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
                   "memory [%s]'s boot variable [%s] not exists",
                   attr.var,
                   attr.boot_var);
-    Tensor* boot_mem =
-        step_scope->GetVariable(attr.boot_var)->GetMutable<Tensor>();
+    Tensor* boot_mem = step_scope->FindVar(attr.boot_var)->GetMutable<Tensor>();
    pre_mem->ShareDataWith<float>(*boot_mem);

    // TODO(qingqing) remove following code
    // the memory of current step should be allocated in step net
    // here for unit test
-    auto cur_step_mem =
-        step_scope->CreateVariable(attr.var)->GetMutable<Tensor>();
+    auto cur_step_mem = step_scope->NewVar(attr.var)->GetMutable<Tensor>();
    cur_step_mem->mutable_data<float>(boot_mem->dims(), platform::CPUPlace());
  }
 }
@@ -312,13 +307,14 @@ public:
      : OpProtoAndCheckerMaker(proto, op_checker) {
    const auto& name = RecurrentOp::kArgName;
    // inputs and outputs stored in proto
-    AddInputs(name.inlinks,
-              "the input that need to be segmented for each step.");
-    AddInputs(name.boot_memories, "variables to initialize memories.");
+    AddInput(name.inlinks, "the input that need to be segmented for each step.")
+        .SetMultiple();
+    AddInput(name.boot_memories, "variables to initialize memories.")
+        .SetMultiple();
    AddInput(name.step_net, "network shared by all steps.");

-    AddOutputs(name.outlinks,
-               "the output that need to concated for all steps.");
+    AddOutput(name.outlinks, "the output that need to concated for all steps.")
+        .SetMultiple();
    AddOutput(name.step_scopes, "step scopes");

    // Attributes stored in AttributeMap
@@ -333,72 +329,69 @@ public:
 };

 void RecurrentGradientAlgorithm::Run(
-    const std::shared_ptr<Scope>& scope,
-    const platform::DeviceContext& dev_ctx) const {
+    const Scope& scope, const platform::DeviceContext& dev_ctx) const {
  auto step_scopes = GetStepScopes(scope);
  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_);
-  PADDLE_ENFORCE(scope->HasVariable(arg_->step_net),
+  PADDLE_ENFORCE(scope.FindVar(arg_->step_net) != nullptr,
                 "step net is not in scope.");
-  Variable* net = scope->GetVariable(arg_->step_net);
+  Variable* net = scope.FindVar(arg_->step_net);
  PADDLE_ENFORCE(net != nullptr, "failed to get step net");
  for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) {
    if (static_cast<size_t>(step_id) != seq_len_ - 1) {
      rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1);
    }
-    net->GetMutable<NetOp>()->Run(step_scopes[step_id], dev_ctx);
+    net->GetMutable<NetOp>()->Run(*step_scopes[step_id], dev_ctx);
  }
  LinkBootMemoryGradients(step_scopes[0]);
  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_);
 }

 void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
-    std::shared_ptr<Scope> step_scope) const {
+    Scope* step_scope) const {
  for (auto& attr : arg_->memories) {
-    Tensor* mem_grad =
-        step_scope->CreateVariable(attr.var)->GetMutable<Tensor>();
+    Tensor* mem_grad = step_scope->NewVar(attr.var)->GetMutable<Tensor>();
    PADDLE_ENFORCE(mem_grad != nullptr,
                   "boot_tensor should be retrieved before");
-    PADDLE_ENFORCE(step_scope->HasVariable(attr.boot_var),
+    PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
                   "memory [%s]'s boot variable [%s] not exists",
                   attr.var,
                   attr.boot_var);
    Tensor* boot_mem_grad =
-        step_scope->CreateVariable(attr.boot_var)->GetMutable<Tensor>();
+        step_scope->NewVar(attr.boot_var)->GetMutable<Tensor>();
    boot_mem_grad->ShareDataWith<float>(*mem_grad);
  }
 }

-void RecurrentGradientAlgorithm::InferShape(
-    const std::shared_ptr<Scope>& scope) const {
-  seq_len_ = scope->GetVariable((arg_->inlinks[0]).external)
+void RecurrentGradientAlgorithm::InferShape(const Scope& scope) const {
+  seq_len_ = scope.FindVar((arg_->inlinks[0]).external)
                 ->GetMutable<Tensor>()
                 ->dims()[0];
  auto step_scopes = GetStepScopes(scope);
  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_);

-  PADDLE_ENFORCE(scope->HasVariable(arg_->step_net),
+  PADDLE_ENFORCE(scope.FindVar(arg_->step_net) != nullptr,
                 "step net is not in scope.");
-  Variable* net = scope->GetVariable(arg_->step_net);
+  Variable* net = scope.FindVar(arg_->step_net);
  PADDLE_ENFORCE(net != nullptr, "failed to get step net");

  for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) {
    if (static_cast<size_t>(step_id) != seq_len_ - 1) {
      rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1);
    }
-    net->GetMutable<NetOp>()->InferShape(step_scopes[step_id]);
+    net->GetMutable<NetOp>()->InferShape(*step_scopes[step_id]);
  }

  auto outlinks = arg_->outlinks;
  for (size_t i = 0; i < outlinks.size(); i++) {
    DDim step_dims = step_scopes[0]
-                         ->GetVariable(outlinks[i].internal)
+                         ->FindVar(outlinks[i].internal)
                         ->GetMutable<Tensor>()
                         ->dims();
    std::vector<int> dims_vec = vectorize(step_dims);
    // now only support fixed length
    dims_vec.insert(dims_vec.begin(), seq_len_);
    Tensor* output =
-        step_scopes[0]->GetVariable(outlinks[i].external)->GetMutable<Tensor>();
+        step_scopes[0]->FindVar(outlinks[i].external)->GetMutable<Tensor>();
    output->Resize(make_ddim(dims_vec));
  }
  LinkBootMemoryGradients(step_scopes[0]);

--- a/paddle/operators/recurrent_network_op.h
+++ b/paddle/operators/recurrent_network_op.h
@@ -70,18 +70,18 @@ struct ArgumentName {
 /**
 * Prepare inputs for each step net.
 */
-void SegmentInputs(std::vector<std::shared_ptr<Scope>>& step_scopes,
+void SegmentInputs(const std::vector<Scope*>& step_scopes,
                   const std::vector<Link>& inlinks,
                   const size_t seq_len);

 /**
 * Process outputs of step nets and merge to variables.
 */
-void ConcatOutputs(std::vector<std::shared_ptr<Scope>>& step_scopes,
+void ConcatOutputs(const std::vector<Scope*>& step_scopes,
                   const std::vector<Link>& outlinks,
                   const size_t seq_len);

-void LinkMemories(std::vector<std::shared_ptr<Scope>>& step_scopes,
+void LinkMemories(const std::vector<Scope*>& step_scopes,
                  const std::vector<MemoryAttr>& memories,
                  size_t step_id,
                  int offset);
@@ -100,15 +100,14 @@ void InitArgument(const ArgumentName& name, Argument* arg);

 class RecurrentAlgorithm {
 public:
-  void Run(const std::shared_ptr<Scope>& scope,
-           const platform::DeviceContext& dev_ctx) const;
+  void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const;

  void Init(std::unique_ptr<rnn::Argument> arg) { arg_ = std::move(arg); }

  /**
   * InferShape must be called before Run.
   */
-  void InferShape(const std::shared_ptr<Scope>& scope) const;
+  void InferShape(const Scope& scope) const;

 protected:
  /*
@@ -117,15 +116,13 @@ protected:
   * NOTE the scopes are reused in both the forward and backward, so just
   * create once and expand its size if more steps need.
   */
-  void CreateScopes(std::shared_ptr<Scope> scope) const;
+  void CreateScopes(const Scope& scope) const;

-  inline const std::vector<std::shared_ptr<Scope>>& GetStepScopes(
-      std::shared_ptr<Scope> scope) const {
-    return *(scope->GetVariable(arg_->step_scopes))
-                ->GetMutable<std::vector<std::shared_ptr<Scope>>>();
+  const std::vector<Scope*>& GetStepScopes(const Scope& scope) const {
+    return *scope.FindVar(arg_->step_scopes)->GetMutable<std::vector<Scope*>>();
  }

-  void InitMemories(std::shared_ptr<Scope> step_scopes) const;
+  void InitMemories(Scope* step_scopes) const;

 private:
  std::unique_ptr<rnn::Argument> arg_;
@@ -146,21 +143,18 @@ class RecurrentGradientAlgorithm {
 public:
  void Init(std::unique_ptr<rnn::Argument> arg) { arg_ = std::move(arg); }

-  void Run(const std::shared_ptr<Scope>& scope,
-           const platform::DeviceContext& dev_ctx) const;
+  void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const;

-  void LinkBootMemoryGradients(std::shared_ptr<Scope> step_scopes) const;
+  void LinkBootMemoryGradients(Scope* step_scopes) const;

  /**
   * InferShape must be called before Run.
   */
-  void InferShape(const std::shared_ptr<Scope>& scope) const;
+  void InferShape(const Scope& scope) const;

 protected:
-  inline const std::vector<std::shared_ptr<Scope>>& GetStepScopes(
-      std::shared_ptr<Scope> scope) const {
-    return *(scope->GetVariable(arg_->step_scopes))
-                ->GetMutable<std::vector<std::shared_ptr<Scope>>>();
+  inline const std::vector<Scope*>& GetStepScopes(const Scope& scope) const {
+    return *scope.FindVar(arg_->step_scopes)->GetMutable<std::vector<Scope*>>();
  }

 private:
@@ -175,11 +169,11 @@ public:
  /**
   * InferShape must be called before Run.
   */
-  virtual void InferShape(const std::shared_ptr<Scope>& scope) const override {
+  virtual void InferShape(const Scope& scope) const override {
    alg_.InferShape(scope);
  }

-  virtual void Run(const std::shared_ptr<Scope>& scope,
+  virtual void Run(const Scope& scope,
                   const platform::DeviceContext& dev_ctx) const override {
    alg_.Run(scope, dev_ctx);
  }
@@ -197,11 +191,11 @@ public:
  /**
   * InferShape must be called before Run.
   */
-  virtual void InferShape(const std::shared_ptr<Scope>& scope) const override {
+  virtual void InferShape(const Scope& scope) const override {
    alg_.InferShape(scope);
  }

-  virtual void Run(const std::shared_ptr<Scope>& scope,
+  virtual void Run(const Scope& scope,
                   const platform::DeviceContext& dev_ctx) const override {
    alg_.Run(scope, dev_ctx);
  }

--- a/paddle/operators/recurrent_network_op_test.cc
+++ b/paddle/operators/recurrent_network_op_test.cc
@@ -34,41 +34,40 @@ protected:
  virtual void TearDown() override {}

  void CreateGlobalVariables() {
-    scope_ = std::make_shared<Scope>();
    // create input, and init content
    LOG(INFO) << "create global variable x";
    for (auto inlink : std::vector<std::string>{"x", "x0", "x1", "h"}) {
-      Variable* x = scope_->CreateVariable(inlink);
+      Variable* x = scope_.NewVar(inlink);
      DDim dims = make_ddim(std::vector<int>{
          10 /*sent size*/, 20 /*batch size*/, 30 /*input dim*/});
      x->GetMutable<Tensor>()->mutable_data<float>(dims, platform::CPUPlace());
    }
    // create output alias just for test
    for (auto inlink : std::vector<std::string>{"h@alias"}) {
-      Variable* x = scope_->CreateVariable(inlink);
+      Variable* x = scope_.NewVar(inlink);
      DDim dims =
          make_ddim(std::vector<int>{20 /*batch size*/, 30 /*input dim*/});
      x->GetMutable<Tensor>()->mutable_data<float>(dims, platform::CPUPlace());
    }

    LOG(INFO) << "create global variable w";
-    Variable* w = scope_->CreateVariable("rnn/w");
+    Variable* w = scope_.NewVar("rnn/w");
    w->GetMutable<Tensor>()->mutable_data<float>(
        make_ddim(std::vector<int>{30, 30}), platform::CPUPlace());

    for (auto boot : std::vector<std::string>{"x_boot", "h_boot"}) {
      LOG(INFO) << "create global variable " << boot;
-      Variable* h_boot = scope_->CreateVariable(boot);
+      Variable* h_boot = scope_.NewVar(boot);
      h_boot->GetMutable<Tensor>()->mutable_data<float>(
          make_ddim(std::vector<int>{20 /*batch size*/, 30 /*input dim*/}),
          platform::CPUPlace());
    }

    LOG(INFO) << "create variable step_scopes";
-    scope_->CreateVariable("step_scopes");
+    scope_.NewVar("step_scopes");

    LOG(INFO) << "create variable h";
-    scope_->CreateVariable("h");
+    scope_.NewVar("h");
  }

  void CreateRNNOp() {
@@ -150,7 +149,7 @@ protected:

  void CreateStepNet() {
    LOG(INFO) << "create variable step_net";
-    Variable* var = scope_->CreateVariable("step_net");
+    Variable* var = scope_.NewVar("step_net");
    auto net = var->GetMutable<NetOp>();
    // rnn/s is net's input or output?
    net->inputs_ = {"rnn/h@pre", "rnn/w", "rnn/x"};
@@ -164,7 +163,7 @@ protected:
  }

  // father scope
-  std::shared_ptr<Scope> scope_;
+  Scope scope_;
  std::shared_ptr<OperatorBase> rnn_op_;
 };

@@ -191,68 +190,64 @@ protected:
  virtual void TearDown() override {}

  void CreateGlobalVariables() {
-    scope_ = std::make_shared<Scope>();
    // inputs: x
    LOG(INFO) << "create global variable x";
-    Variable* x = scope_->CreateVariable("x");
+    Variable* x = scope_.NewVar("x");
    DDim dims =
        make_ddim({10 /*sent size*/, 20 /*batch size*/, 30 /*input dim*/});
    x->GetMutable<Tensor>()->mutable_data<float>(dims, platform::CPUPlace());
    // inputs: h_boot
    LOG(INFO) << "create global variable h_boot";
-    Variable* h_boot = scope_->CreateVariable("h_boot");
+    Variable* h_boot = scope_.NewVar("h_boot");
    h_boot->GetMutable<Tensor>()->mutable_data<float>(
        make_ddim({20 /*batch size*/, 30 /*input dim*/}), platform::CPUPlace());
    // inputs: w
    LOG(INFO) << "create global variable w";
-    Variable* w = scope_->CreateVariable("rnn/w");
+    Variable* w = scope_.NewVar("rnn/w");
    w->GetMutable<Tensor>()->mutable_data<float>(make_ddim({30, 30}),
                                                 platform::CPUPlace());
    // inputs: h_grad
    LOG(INFO) << "create variable h_grad";
-    Variable* dh = scope_->CreateVariable("h_grad");
+    Variable* dh = scope_.NewVar("h_grad");
    dh->GetMutable<Tensor>()->mutable_data<float>(make_ddim({10, 20, 30}),
                                                  platform::CPUPlace());
    // inputs: step_scopes
    LOG(INFO) << "create variable step_scopes";
-    scope_->CreateVariable("step_scopes");
+    scope_.NewVar("step_scopes");
    // inputs: step_net
    LOG(INFO) << "create variable step_net";
-    scope_->CreateVariable("step_net");
+    scope_.NewVar("step_net");
    // outputs: w_grad
    LOG(INFO) << "create global variable w_grad";
-    scope_->CreateVariable("rnn/w_grad");
+    scope_.NewVar("rnn/w_grad");
    // outputs: x_grad
    LOG(INFO) << "create global variable x_grad";
-    scope_->CreateVariable("x_grad");
+    scope_.NewVar("x_grad");
    // outputs: h_boot_grad
    LOG(INFO) << "create global variable h_boot_grad";
-    scope_->CreateVariable("h_boot_grad");
+    scope_.NewVar("h_boot_grad");
  }

  void CreateStepScopes() {
-    std::vector<std::shared_ptr<Scope>>* step_scopes =
-        scope_->GetVariable("step_scopes")
-            ->GetMutable<std::vector<std::shared_ptr<Scope>>>();
+    auto step_scopes =
+        scope_.FindVar("step_scopes")->GetMutable<std::vector<Scope*>>();
    for (int i = 0; i < 10; ++i) {
-      auto scope = std::make_shared<Scope>(scope_);
-      auto pre_t = scope->CreateVariable("rnn/pre_h")->GetMutable<Tensor>();
-      pre_t->mutable_data<float>(make_ddim({20, 30}), platform::CPUPlace());
-      auto tensor = scope->CreateVariable("rnn/h")->GetMutable<Tensor>();
-      tensor->mutable_data<float>(make_ddim({20, 30}), platform::CPUPlace());
+      auto& scope = scope_.NewScope();
+      auto pre_t = scope.NewVar("rnn/pre_h")->GetMutable<Tensor>();
+      pre_t->mutable_data<float>({20, 30}, platform::CPUPlace());
+      auto tensor = scope.NewVar("rnn/h")->GetMutable<Tensor>();
+      tensor->mutable_data<float>({20, 30}, platform::CPUPlace());

      // for unit test of ConcatOutputs
-      auto xg = scope->CreateVariable("rnn/x_grad")->GetMutable<Tensor>();
-      xg->mutable_data<float>(make_ddim({20, 30}), platform::CPUPlace());
+      auto xg = scope.NewVar("rnn/x_grad")->GetMutable<Tensor>();
+      xg->mutable_data<float>({20, 30}, platform::CPUPlace());

-      step_scopes->push_back(scope);
+      step_scopes->emplace_back(&scope);
    }

    // last time step
-    auto g = (*step_scopes)[9]
-                 ->CreateVariable("rnn/h_pre_grad")
-                 ->GetMutable<Tensor>();
-    g->mutable_data<float>(make_ddim({20, 30}), platform::CPUPlace());
+    auto g = (*step_scopes)[9]->NewVar("rnn/h_pre_grad")->GetMutable<Tensor>();
+    g->mutable_data<float>({20, 30}, platform::CPUPlace());
  }

  void CreateRNNGradientAlgorithm() {
@@ -280,7 +275,7 @@ protected:

  void CreateStepNet() {
    LOG(INFO) << "create variable step_net";
-    Variable* var = scope_->CreateVariable("step_net");
+    Variable* var = scope_.NewVar("step_net");
    auto net = var->GetMutable<NetOp>();
    net->AddOp(OpRegistry::CreateOp("mul",
                                    {"rnn/h_pre", "rnn/w", "rnn/s_grad"},
@@ -300,9 +295,8 @@ protected:
    rnn::Link inlink;
    inlink.external = "x";
    inlink.internal = "rnn/x";
-    std::vector<std::shared_ptr<Scope>>* step_scopes =
-        scope_->GetVariable("step_scopes")
-            ->GetMutable<std::vector<std::shared_ptr<Scope>>>();
+    auto step_scopes =
+        scope_.FindVar("step_scopes")->GetMutable<std::vector<Scope*>>();
    rnn::SegmentInputs(*step_scopes, std::vector<rnn::Link>{inlink}, 10);
  }

@@ -314,15 +308,14 @@ protected:
    mem_attr.boot_var = "boot_h";
    std::vector<rnn::MemoryAttr> memories;
    memories.push_back(mem_attr);
-    std::vector<std::shared_ptr<Scope>>* step_scopes =
-        scope_->GetVariable("step_scopes")
-            ->GetMutable<std::vector<std::shared_ptr<Scope>>>();
+    auto step_scopes =
+        scope_.FindVar("step_scopes")->GetMutable<std::vector<Scope*>>();
    for (int i = 1; i < 10; ++i) {
      rnn::LinkMemories(*step_scopes, memories, i, -1);
    }
  }

-  std::shared_ptr<Scope> scope_;
+  Scope scope_;
  RecurrentGradientAlgorithm rnn_grad_algo_;
 };

@@ -341,14 +334,14 @@ TEST(RecurrentOp, LinkMemories) {

  // create and init step scopes
  int len = 10;
-  std::vector<std::shared_ptr<Scope>> step_scopes;
+  std::vector<Scope*> step_scopes;
  for (int i = 0; i < len; ++i) {
-    auto scope = std::make_shared<Scope>();
-    scope->CreateVariable("pre_h");
-    auto tensor = scope->CreateVariable("h")->GetMutable<Tensor>();
-    float* data = tensor->mutable_data<float>(make_ddim({15, 20}), CPUPlace());
-    for (int i = 0; i < 15 * 20; ++i) {
-      data[i] = rand() * (1. / (double)RAND_MAX);
+    auto scope = new Scope();
+    scope->NewVar("pre_h");
+    auto tensor = scope->NewVar("h")->GetMutable<Tensor>();
+    float* data = tensor->mutable_data<float>({15, 20}, CPUPlace());
+    for (int j = 0; j < 15 * 20; ++j) {
+      data[j] = rand() * (1. / (double)RAND_MAX);
    }
    step_scopes.push_back(scope);
  }
@@ -367,9 +360,9 @@ TEST(RecurrentOp, LinkMemories) {
  // check
  for (int i = 0; i < len - 1; ++i) {
    const float* a =
-        step_scopes[i]->GetVariable("h")->GetMutable<Tensor>()->data<float>();
+        step_scopes[i]->FindVar("h")->GetMutable<Tensor>()->data<float>();
    const float* b = step_scopes[i + 1]
-                         ->GetVariable("pre_h")
+                         ->FindVar("pre_h")
                         ->GetMutable<Tensor>()
                         ->data<float>();
    for (size_t i = 0; i < 15 * 20; ++i) {
@@ -382,19 +375,25 @@ TEST(RecurrentOp, LinkMemories) {
  }
  // check
  for (int i = len - 2; i >= 0; --i) {
-    const float* a = step_scopes[i]
-                         ->GetVariable("pre_h")
-                         ->GetMutable<Tensor>()
-                         ->data<float>();
-    const float* b = step_scopes[i + 1]
-                         ->GetVariable("h")
-                         ->GetMutable<Tensor>()
-                         ->data<float>();
+    const float* a =
+        step_scopes[i]->FindVar("pre_h")->GetMutable<Tensor>()->data<float>();
+    const float* b =
+        step_scopes[i + 1]->FindVar("h")->GetMutable<Tensor>()->data<float>();
    for (size_t i = 0; i < 15 * 20; ++i) {
      ASSERT_FLOAT_EQ(a[i], b[i]);
    }
  }
+
+  for (auto s : step_scopes) {
+    delete s;
+  }
 }

 USE_OP(add_two);
 USE_OP(mul);
+
+// int main() {
+//  //! TODO(yuyang18): Temporary disable this unit-test because implementation
+//  //! error.
+//  return 0;
+//}
\ No newline at end of file
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -18,17 +18,17 @@ namespace operators {

 class RowWiseAddOp : public OperatorWithKernel {
 protected:
-  void InferShape(const std::vector<const Tensor *> &inputs,
-                  const std::vector<Tensor *> &outputs) const override {
-    PADDLE_ENFORCE(inputs.size() == 2UL, "Two inputs is needed by rowwise add");
-    auto dim0 = inputs[0]->dims();
-    auto dim1 = inputs[1]->dims();
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 2UL,
+                   "Two inputs is needed by rowwise add");
+    auto dim0 = ctx.Input<Tensor>(0)->dims();
+    auto dim1 = ctx.Input<Tensor>(1)->dims();

    PADDLE_ENFORCE(dim0.size() == 2, "Input 0 must be matrix");
    PADDLE_ENFORCE(dim1.size() == 1, "The second input must be vector");
    PADDLE_ENFORCE(dim0[1] == dim1[0], "The width of two input must be same");
-    PADDLE_ENFORCE(outputs.size() == 1, "The output size must be 1");
-    outputs[0]->Resize(inputs[0]->dims());
+    PADDLE_ENFORCE(ctx.OutputSize() == 1, "The output size must be 1");
+    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
  }
 };


--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
@@ -21,14 +21,12 @@ namespace operators {
 template <typename Place, typename T>
 class RowWiseAddKernel : public OpKernel {
 public:
-  void Compute(const KernelContext& context) const override {
-    auto in0 = context.Input(0)->Get<Tensor>();
-    auto in1 = context.Input(1)->Get<Tensor>();
-    auto* out = context.Output(0)->GetMutable<Tensor>();
+  void Compute(const ExecutionContext& context) const override {
+    auto out = context.Output<Tensor>(0);
    out->mutable_data<T>(context.GetPlace());

-    auto input = EigenMatrix<T>::From(in0);
-    auto bias = EigenVector<T>::From(in1);
+    auto input = EigenMatrix<T>::From(*context.Input<Tensor>(0));
+    auto bias = EigenVector<T>::From(*context.Input<Tensor>(1));
    auto output = EigenMatrix<T>::From(*out);

    const int bias_size = bias.dimension(0);

--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -19,16 +19,15 @@ namespace operators {

 class SGDOp : public OperatorWithKernel {
 protected:
-  void InferShape(const std::vector<const Tensor *> &inputs,
-                  const std::vector<Tensor *> &outputs) const override {
-    PADDLE_ENFORCE(inputs.size() == 2, "Input size of SGDOp must be two");
-    PADDLE_ENFORCE(outputs.size() == 1, "Output size of SGDOp must be one");
-    PADDLE_ENFORCE(inputs[0] != nullptr, "inputs[0] mast be set");
-    PADDLE_ENFORCE(inputs[1] != nullptr, "inputs[1] mast be set");
-    PADDLE_ENFORCE(outputs[0] != nullptr, "outputs[0] mast be set");
-    PADDLE_ENFORCE(inputs[0]->dims() == inputs[1]->dims(),
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 2, "Input size of SGDOp must be two");
+    PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of SGDOp must be one");
+    PADDLE_ENFORCE(ctx.InputVar(0) != nullptr, "inputs[0] mast be set");
+    PADDLE_ENFORCE(ctx.InputVar(1) != nullptr, "inputs[1] mast be set");
+    PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr, "outputs[0] mast be set");
+    PADDLE_ENFORCE(ctx.Input<Tensor>(0)->dims() == ctx.Input<Tensor>(1)->dims(),
                   "Two input of SGD Op's dimension must be same.");
-    outputs[0]->Resize(inputs[0]->dims());
+    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
  }
 };


--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
@@ -21,16 +21,16 @@ namespace operators {
 template <typename Place, typename T>
 class SGDOpKernel : public OpKernel {
 public:
-  void Compute(const KernelContext& ctx) const override {
-    auto param = ctx.Input("param")->Get<Tensor>();
-    auto grad = ctx.Input("grad")->Get<Tensor>();
-    auto* param_out = ctx.Output(0)->GetMutable<Tensor>();
+  void Compute(const ExecutionContext& ctx) const override {
+    auto param = ctx.Input<Tensor>("param");
+    auto grad = ctx.Input<Tensor>("grad");
+    auto param_out = ctx.Output<Tensor>(0);
    float lr = ctx.op_.GetAttr<float>("learning_rate");

    param_out->mutable_data<T>(ctx.GetPlace());

    EigenVector<T>::Flatten(*param_out).device(*(ctx.GetEigenDevice<Place>())) =
-        EigenVector<T>::Flatten(param) - lr * EigenVector<T>::Flatten(grad);
+        EigenVector<T>::Flatten(*param) - lr * EigenVector<T>::Flatten(*grad);
  }
 };


--- a/paddle/operators/sigmoid_op.cc
+++ b/paddle/operators/sigmoid_op.cc
@@ -18,11 +18,10 @@ namespace operators {

 class SigmoidOp : public OperatorWithKernel {
 protected:
-  void InferShape(const std::vector<const Tensor *> &inputs,
-                  const std::vector<Tensor *> &outputs) const override {
-    PADDLE_ENFORCE(inputs.size() == 1, "Sigmoid Op only have one input");
-    PADDLE_ENFORCE(outputs.size() == 1, "Sigmoid Op only have one output");
-    outputs[0]->Resize(inputs[0]->dims());
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 1, "Sigmoid Op only have one input");
+    PADDLE_ENFORCE(ctx.OutputSize() == 1, "Sigmoid Op only have one output");
+    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
  }
 };

@@ -38,8 +37,7 @@ public:

 class SigmoidOpGrad : public OperatorWithKernel {
 protected:
-  void InferShape(const std::vector<const Tensor *> &inputs,
-                  const std::vector<Tensor *> &outputs) const override {}
+  void InferShape(const InferShapeContext &ctx) const override {}
  std::string DebugString() const override {
    LOG(INFO) << "SigmoidGrad";
    return "";

--- a/paddle/operators/sigmoid_op.h
+++ b/paddle/operators/sigmoid_op.h
@@ -22,15 +22,14 @@ namespace operators {
 template <typename Place, typename T>
 class SigmoidKernel : public OpKernel {
 public:
-  void Compute(const KernelContext& context) const override {
-    auto input = context.Input(0)->Get<Tensor>();
-    auto* output = context.Output(0)->GetMutable<Tensor>();
-
+  void Compute(const ExecutionContext& context) const override {
+    auto input = context.Input<Tensor>(0);
+    auto output = context.Output<Tensor>(0);
    output->mutable_data<T>(context.GetPlace());

    EigenVector<T>::Flatten(*output).device(
        *(context.GetEigenDevice<Place>())) =
-        1.0 / (1.0 + (-1.0 * EigenVector<T>::Flatten(input)).exp());
+        1.0 / (1.0 + (-1.0 * EigenVector<T>::Flatten(*input)).exp());
  }
 };
 }  // namespace operators

--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -18,14 +18,13 @@ namespace operators {

 class SoftmaxOp : public OperatorWithKernel {
 protected:
-  void InferShape(const std::vector<const Tensor *> &inputs,
-                  const std::vector<Tensor *> &outputs) const override {
-    PADDLE_ENFORCE(inputs.size() == 1, "Only one input is need for softmax");
-    PADDLE_ENFORCE(inputs[0]->dims().size() == 2,
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 1, "Only one input is need for softmax");
+    PADDLE_ENFORCE(ctx.Input<Tensor>(0)->dims().size() == 2,
                   "The input of softmax op must be matrix");
-    PADDLE_ENFORCE(outputs.size() == 1, "Only one output is need for softmax");
-
-    outputs[0]->Resize(inputs[0]->dims());
+    PADDLE_ENFORCE(ctx.OutputSize() == 1,
+                   "Only one output is need for softmax");
+    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
  }
 };

@@ -41,8 +40,7 @@ public:

 class SoftmaxOpGrad : public OperatorWithKernel {
 protected:
-  void InferShape(const std::vector<const Tensor *> &inputs,
-                  const std::vector<Tensor *> &outputs) const override {}
+  void InferShape(const InferShapeContext &ctx) const override {}
  std::string DebugString() const override {
    LOG(INFO) << "SoftmaxOpGrad";
    return "";

--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -22,12 +22,12 @@ namespace operators {
 template <typename Place, typename T>
 class SoftmaxKernel : public OpKernel {
 public:
-  void Compute(const KernelContext& context) const override {
-    auto input = context.Input(0)->Get<Tensor>();
-    auto* output = context.Output(0)->GetMutable<Tensor>();
+  void Compute(const ExecutionContext& context) const override {
+    auto input = context.Input<Tensor>(0);
+    auto output = context.Output<Tensor>(0);
    output->mutable_data<T>(context.GetPlace());

-    auto logits = EigenMatrix<T>::From(input);
+    auto logits = EigenMatrix<T>::From(*input);
    auto softmax = EigenMatrix<T>::From(*output);

    const int kBatchDim = 0;

--- a/paddle/operators/type_alias.h
+++ b/paddle/operators/type_alias.h
@@ -22,7 +22,13 @@ namespace paddle {
 namespace operators {

 using OpKernel = framework::OpKernel;
-using KernelContext = framework::KernelContext;
+using InferShapeContext = framework::InferShapeContext;
+using ExecutionContext = framework::ExecutionContext;
+using Variable = framework::Variable;
+template <typename T,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
 template <typename T,
          int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>

--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -14,7 +14,9 @@ limitations under the License. */

 #pragma once

+#include <execinfo.h>
 #include <paddle/string/printf.h>
+#include <iomanip>
 #include <sstream>
 #include <stdexcept>
 #include <string>
@@ -39,12 +41,22 @@ namespace platform {
 struct EnforceNotMet : public std::exception {
  std::exception_ptr exp_;
  std::string err_str_;
-
  EnforceNotMet(std::exception_ptr e, const char* f, int l) : exp_(e) {
+    static constexpr int TRACE_STACK_LIMIT = 100;
    try {
      std::rethrow_exception(exp_);
    } catch (const std::exception& exp) {
-      err_str_ = string::Sprintf("%s at [%s:%d]", exp.what(), f, l);
+      std::ostringstream sout;
+      sout << string::Sprintf("%s at [%s:%d]", exp.what(), f, l) << std::endl;
+      sout << "Call Stacks: " << std::endl;
+      void* call_stack[TRACE_STACK_LIMIT];
+      int sz = backtrace(call_stack, TRACE_STACK_LIMIT);
+      auto line = backtrace_symbols(call_stack, sz);
+      for (int i = 0; i < sz; ++i) {
+        sout << line[i] << std::endl;
+      }
+      free(line);
+      err_str_ = sout.str();
    }
  }


--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
-cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python
-        add_op fc_op sgd_op cross_entropy_op recurrent_network_op)
+cc_library(paddle_pybind SHARED
+    SRCS pybind.cc
+    DEPS pybind python
+	fc_op
+	sgd_op
+	add_op
+	mean_op
+	cross_entropy_op
+	recurrent_network_op)
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -33,6 +33,7 @@ USE_OP(onehot_cross_entropy);
 USE_OP_WITHOUT_KERNEL(fc);
 USE_OP(sgd);
 USE_OP(mul);
+USE_OP(mean);
 USE_OP(sigmoid);
 USE_OP(softmax);
 USE_OP(rowwise_add);
@@ -102,15 +103,18 @@ All parameter, weight, gradient are variables in Paddle.
           },
           py::return_value_policy::reference);

-  py::class_<pd::Scope, std::shared_ptr<pd::Scope>>(m, "Scope")
-      .def(py::init<const std::shared_ptr<pd::Scope>&>())
-      .def("get_var",
-           &pd::Scope::GetVariable,
+  py::class_<pd::Scope>(m, "Scope", "")
+      .def("new_var",
+           [](pd::Scope& self, const std::string& name) -> pd::Variable* {
+             return self.NewVar(name);
+           },
           py::return_value_policy::reference)
-      .def("create_var",
-           &pd::Scope::CreateVariable,
+      .def("find_var", &pd::Scope::FindVar, py::return_value_policy::reference)
+      .def(py::init<>())
+      .def("new_scope",
+           [](pd::Scope& self) -> pd::Scope* { return &self.NewScope(); },
           py::return_value_policy::reference)
-      .def("get_var_name", &pd::Scope::GetVariableName);
+      .def("drop_kids", &pd::Scope::DropKids);

  //! @note: Be careful! PyBind will return std::string as an unicode, not
  //! Python str. If you want a str object, you should cast them in Python.

--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -198,6 +198,11 @@ message RowConvConfig {
  required uint32 context_length = 1;
 }

+message SliceConfig {
+  required uint32 start = 1;
+  required uint32 end = 2;
+}
+
 message ProjectionConfig {
  required string type = 1;
  required string name = 2;
@@ -218,6 +223,10 @@ message ProjectionConfig {

  // For pool
  optional PoolConfig pool_conf = 12;
+
+  // For slice
+  // Each slice output is the input[start, end)
+  repeated SliceConfig slices = 13;
 }

 message OperatorConfig {

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -565,6 +565,35 @@ class IdentityOffsetProjection(Projection):
        return []


+@config_class
+class SliceProjection(Projection):
+    type = 'slice'
+
+    def __init__(self, input_layer_name, slices, **xargs):
+        super(SliceProjection, self).__init__(input_layer_name, **xargs)
+        input = g_layer_map[input_layer_name]
+        if input.type in ["exconv", "cudnn_conv"]:
+            # the slice operator is for the channel dimension
+            assert input.num_filters is not None
+            channels = input.num_filters
+            image_size = input.size / channels
+            assert slices[len(slices) - 1][1] <= channels
+            for i in xrange(len(slices)):
+                slice = self.proj_conf.slices.add()
+                slice.start = slices[i][0] * image_size
+                slice.end = slices[i][1] * image_size
+                self.size += slice.end - slice.start
+        else:
+            config_assert(False,
+                          'Currently the input should be convolution layer')
+
+    def calc_parameter_size(self, input_size, output_size):
+        return 0
+
+    def calc_parameter_dims(self, input_size, output_size):
+        return []
+
+
 # DotMulProjection performs element-wise multiplication with weight
 @config_class
 class DotMulProjection(Projection):

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -129,6 +129,7 @@ __all__ = [
    'gated_unit_layer',
    'crop_layer',
    'clip_layer',
+    'slice_projection',
 ]


@@ -538,6 +539,45 @@ def identity_projection(input, offset=None, size=None):
    return proj


+def slice_projection(input, slices):
+    """
+    slice_projection can slice the input value into multiple parts,
+    and then select some of them to merge into a new output.
+
+    .. math::
+       output = [input.slices()]
+
+    The example usage is:
+
+    .. code-block:: python
+
+       proj = slice_projection(input=layer, slices=[(0, 10), (20, 30)])
+
+    Note that slice_projection should not have any parameter.
+
+    :param input: Input Layer.
+    :type input: LayerOutput
+    :param slices: An array of slice parameters.
+                   Each slice contains the start and end offsets based
+                   on the input.
+    :type slices: pair of int
+    :return: A SliceProjection object
+    :rtype: SliceProjection
+    """
+    assert len(slices) >= 1
+    start = 0
+    for i in xrange(len(slices)):
+        assert len(slices[i]) == 2
+        # The start position of the next slice needs to be greater than
+        # or equal to the end position of the previous slice.
+        assert slices[i][0] >= start
+        assert slices[i][1] >= slices[i][0]
+        start = slices[i][1]
+    proj = SliceProjection(input_layer_name=input.name, slices=slices)
+    proj.origin = input
+    return proj
+
+
 @wrap_param_attr_default()
 def scaling_projection(input, param_attr=None):
    """

--- a/python/paddle/v2/framework/default_scope_funcs.py
+++ b/python/paddle/v2/framework/default_scope_funcs.py
@@ -5,7 +5,7 @@ Default scope function.
 thread-local stack of Scope. Top of that stack is current scope, the bottom 
 of that stack is all scopes' parent. 

-Invoking `create_var/get_var`  can `create/get` variable in current scope. 
+Invoking `new_var/find_var`  can `new/find` variable in current scope. 
 Invoking `enter_local_scope/leave_local_scope` can create or destroy local 
 scope. 

@@ -19,8 +19,8 @@ import threading
 __tl_scope__ = threading.local()

 __all__ = [
-    'get_cur_scope', 'enter_local_scope', 'leave_local_scope', 'create_var',
-    'get_var', 'scoped_function'
+    'get_cur_scope', 'enter_local_scope', 'leave_local_scope', 'new_var',
+    'find_var', 'scoped_function'
 ]


@@ -33,7 +33,7 @@ def get_cur_scope():
    if cur_scope_stack is None:
        __tl_scope__.cur_scope = list()
    if len(__tl_scope__.cur_scope) == 0:
-        __tl_scope__.cur_scope.append(paddle.v2.framework.core.Scope(None))
+        __tl_scope__.cur_scope.append(paddle.v2.framework.core.Scope())
    return __tl_scope__.cur_scope[-1]


@@ -42,7 +42,7 @@ def enter_local_scope():
    Enter a new local scope
    """
    cur_scope = get_cur_scope()
-    new_scope = paddle.v2.framework.core.Scope(cur_scope)
+    new_scope = cur_scope.new_scope()
    __tl_scope__.cur_scope.append(new_scope)


@@ -51,20 +51,21 @@ def leave_local_scope():
    Leave local scope
    """
    __tl_scope__.cur_scope.pop()
+    get_cur_scope().drop_kids()


-def create_var(name):
+def new_var(name):
    """
    create variable in current scope.
    """
-    return get_cur_scope().create_var(name)
+    return get_cur_scope().new_var(name)


-def get_var(name):
+def find_var(name):
    """
    get variable in current scope.
    """
-    return get_cur_scope().get_var(name)
+    return get_cur_scope().find_var(name)


 def scoped_function(func):

--- a/python/paddle/v2/framework/network.py
+++ b/python/paddle/v2/framework/network.py
 import paddle.v2.framework.core as core
 from paddle.v2.framework.create_op_creation_methods import op_creations
-from default_scope_funcs import create_var, get_var, get_cur_scope
+from default_scope_funcs import new_var, find_var, get_cur_scope

 __all__ = ['Network']  # Only expose Network

@@ -29,12 +29,15 @@ class NetworkFunctor(object):
            if ipt in kwargs:
                var = kwargs[ipt]
                if isinstance(var, basestring):
-                    var = create_var(var)
+                    tmp = new_var(var)
+                    self.net.var_names[tmp] = var
+                    var = tmp
+
                if not isinstance(var, core.Variable):
                    raise TypeError(
                        "Input of op creation must be string or variable")

-                kwargs[ipt] = get_cur_scope().get_var_name(var)
+                kwargs[ipt] = self.net.var_names[var]

        notemp_outputs = self.func.all_not_temp_output_args

@@ -49,17 +52,20 @@ class NetworkFunctor(object):
            if opt in kwargs:
                var = kwargs[opt]
                if isinstance(var, basestring):
-                    var = create_var(var)
+                    tmp = new_var(var)
+                    self.net.var_names[tmp] = var
+                    var = tmp
+
                if not isinstance(var, core.Variable):
                    raise TypeError(
                        "Output of op creation must be string or variable")
-                kwargs[opt] = get_cur_scope().get_var_name(var)
+                kwargs[opt] = self.net.var_names[var]

        op = self.func(**kwargs)

        self.net.net.add_op(op)

-        lst = [get_var(kwargs[opt]) for opt in notemp_outputs]
+        lst = [find_var(kwargs[opt]) for opt in notemp_outputs]
        if len(lst) == 1:
            return lst[0]
        elif len(lst) == 0:
@@ -89,6 +95,7 @@ class Network(object):
        self.net = core.Net.create()
        funcs = (func_name for func_name in dir(op_creations)
                 if not func_name.startswith("__"))
+        self.var_names = dict()

        # TODO(yuyang18): This code can work, but do not generate a good
        # docstring, try to give a better way generate function in runtime

--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -10,6 +10,7 @@ add_python_test(test_framework
    test_sgd_op.py
    test_cross_entropy_op.py
    test_mul_op.py
+    test_mean_op.py
    test_sigmoid_op.py
    test_softmax_op.py
    test_rowwise_add_op.py

--- a/python/paddle/v2/framework/tests/op_test_util.py
+++ b/python/paddle/v2/framework/tests/op_test_util.py
@@ -24,13 +24,13 @@ class OpTestMeta(type):
            func = getattr(creation.op_creations, self.type, None)
            self.assertIsNotNone(func)

-            scope = core.Scope(None)
+            scope = core.Scope()
            kwargs = dict()

            for in_name in func.all_input_args:
                if hasattr(self, in_name):
                    kwargs[in_name] = in_name
-                    var = scope.create_var(in_name).get_tensor()
+                    var = scope.new_var(in_name).get_tensor()
                    arr = getattr(self, in_name)
                    var.set_dims(arr.shape)
                    var.set(arr)
@@ -40,7 +40,7 @@ class OpTestMeta(type):
            for out_name in func.all_output_args:
                if hasattr(self, out_name):
                    kwargs[out_name] = out_name
-                    scope.create_var(out_name).get_tensor()
+                    scope.new_var(out_name).get_tensor()

            for attr_name in func.all_attr_args:
                if hasattr(self, attr_name):
@@ -54,7 +54,7 @@ class OpTestMeta(type):
            op.run(scope, ctx)

            for out_name in func.all_output_args:
-                actual = numpy.array(scope.get_var(out_name).get_tensor())
+                actual = numpy.array(scope.find_var(out_name).get_tensor())
                expect = getattr(self, out_name)
                # TODO(qijun) The default decimal is 7, but numpy.dot and eigen.mul
                # has some diff, and could not pass unittest. So I set decimal 3 here.

--- a/python/paddle/v2/framework/tests/test_default_scope_funcs.py
+++ b/python/paddle/v2/framework/tests/test_default_scope_funcs.py
@@ -7,19 +7,19 @@ class TestDefaultScopeFuncs(unittest.TestCase):
        self.assertIsNotNone(get_cur_scope())

    def test_none_variable(self):
-        self.assertIsNone(get_var("test"))
+        self.assertIsNone(find_var("test"))

    def test_create_var_get_var(self):
-        var_a = create_var("var_a")
+        var_a = new_var("var_a")
        self.assertIsNotNone(var_a)
-        self.assertIsNotNone(get_cur_scope().get_var('var_a'))
+        self.assertIsNotNone(get_cur_scope().find_var('var_a'))
        enter_local_scope()
-        self.assertIsNotNone(get_cur_scope().get_var('var_a'))
+        self.assertIsNotNone(get_cur_scope().find_var('var_a'))
        leave_local_scope()

    def test_var_get_int(self):
        def __new_scope__():
-            i = create_var("var_i")
+            i = new_var("var_i")
            self.assertFalse(i.is_int())
            i.set_int(10)
            self.assertTrue(i.is_int())

--- a/python/paddle/v2/framework/tests/test_fc_op.py
+++ b/python/paddle/v2/framework/tests/test_fc_op.py
@@ -6,13 +6,13 @@ import paddle.v2.framework.create_op_creation_methods as creation

 class TestFc(unittest.TestCase):
    def test_fc(self):
-        scope = core.Scope(None)
-        x = scope.create_var("X")
+        scope = core.Scope()
+        x = scope.new_var("X")
        x_tensor = x.get_tensor()
        x_tensor.set_dims([1000, 784])
        x_tensor.alloc_float()

-        w = scope.create_var("W")
+        w = scope.new_var("W")
        w_tensor = w.get_tensor()
        w_tensor.set_dims([784, 100])
        w_tensor.alloc_float()
@@ -25,10 +25,10 @@ class TestFc(unittest.TestCase):
        op = creation.op_creations.fc(X="X", Y="Y", W="W")

        for out in op.outputs():
-            if scope.get_var(out) is None:
-                scope.create_var(out).get_tensor()
+            if scope.find_var(out) is None:
+                scope.new_var(out).get_tensor()

-        tensor = scope.get_var("Y").get_tensor()
+        tensor = scope.find_var("Y").get_tensor()
        op.infer_shape(scope)
        self.assertEqual([1000, 100], tensor.shape())


--- a/python/paddle/v2/framework/tests/test_mean_op.py
+++ b/python/paddle/v2/framework/tests/test_mean_op.py
+import unittest
+from op_test_util import OpTestMeta
+import numpy as np
+
+
+class TestMeanOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "mean"
+        self.X = np.random.random((32, 784)).astype("float32")
+        self.Out = np.mean(self.X)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_scope.py
+++ b/python/paddle/v2/framework/tests/test_scope.py
@@ -5,29 +5,29 @@ import unittest
 class TestScope(unittest.TestCase):
    def test_create_destroy(self):
        paddle_c = paddle.v2.framework.core
-        scope = paddle_c.Scope(None)
+        scope = paddle_c.Scope()
        self.assertIsNotNone(scope)
-        scope_with_parent = paddle_c.Scope(scope)
+        scope_with_parent = scope.new_scope()
        self.assertIsNotNone(scope_with_parent)

    def test_none_variable(self):
        paddle_c = paddle.v2.framework.core
-        scope = paddle_c.Scope(None)
-        self.assertIsNone(scope.get_var("test"))
+        scope = paddle_c.Scope()
+        self.assertIsNone(scope.find_var("test"))

    def test_create_var_get_var(self):
        paddle_c = paddle.v2.framework.core
-        scope = paddle_c.Scope(None)
-        var_a = scope.create_var("var_a")
+        scope = paddle_c.Scope()
+        var_a = scope.new_var("var_a")
        self.assertIsNotNone(var_a)
-        self.assertIsNotNone(scope.get_var('var_a'))
-        scope2 = paddle_c.Scope(scope)
-        self.assertIsNotNone(scope2.get_var('var_a'))
+        self.assertIsNotNone(scope.find_var('var_a'))
+        scope2 = scope.new_scope()
+        self.assertIsNotNone(scope2.find_var('var_a'))

    def test_var_get_int(self):
        paddle_c = paddle.v2.framework.core
-        scope = paddle_c.Scope(None)
-        var = scope.create_var("test_int")
+        scope = paddle_c.Scope()
+        var = scope.new_var("test_int")
        var.set_int(10)
        self.assertTrue(var.is_int())
        self.assertEqual(10, var.get_int())

--- a/python/paddle/v2/framework/tests/test_tensor.py
+++ b/python/paddle/v2/framework/tests/test_tensor.py
@@ -5,8 +5,8 @@ import numpy

 class TestScope(unittest.TestCase):
    def test_int_tensor(self):
-        scope = core.Scope(None)
-        var = scope.create_var("test_tensor")
+        scope = core.Scope()
+        var = scope.new_var("test_tensor")
        tensor = var.get_tensor()

        tensor.set_dims([1000, 784])
@@ -23,8 +23,8 @@ class TestScope(unittest.TestCase):
        self.assertEqual(2.0, tensor_array_2[19, 11])

    def test_float_tensor(self):
-        scope = core.Scope(None)
-        var = scope.create_var("test_tensor")
+        scope = core.Scope()
+        var = scope.new_var("test_tensor")
        tensor = var.get_tensor()

        tensor.set_dims([1000, 784])

--- a/python/paddle/v2/master/client.py
+++ b/python/paddle/v2/master/client.py
@@ -76,3 +76,6 @@ class client(object):
        # Memory created from C should be freed.
        get_c_lib().mem_free(ret.contents)
        return record, 0
+
+    def paddle_start_get_records(self, pass_id):
+        get_c_lib().paddle_start_get_records(self.c, pass_id)
--- a/python/paddle/v2/reader/creator.py
+++ b/python/paddle/v2/reader/creator.py
@@ -16,7 +16,7 @@ Creator package contains some simple reader creator, which could
 be used in user program.
 """

-__all__ = ['np_array', 'text_file', "recordio"]
+__all__ = ['np_array', 'text_file', "cloud_reader"]


 def np_array(x):
@@ -81,35 +81,41 @@ def recordio_local(paths, buf_size=100):
    return dec.buffered(reader, buf_size)


-def recordio(paths, buf_size=100):
+pass_num = 0
+
+
+def cloud_reader(paths, etcd_endpoints, timeout_sec=5, buf_size=64):
    """
-    Creates a data reader that outputs record one one by one
-        from given local or cloud recordio path.
+    Create a data reader that yield a record one by one from
+        the paths:
    :path: path of recordio files.
+    :etcd_endpoints: the endpoints for etcd cluster
    :returns: data reader of recordio files.
+
+    ..  code-block:: python
+        from paddle.v2.reader.creator import cloud_reader
+        etcd_endpoints = "http://127.0.0.1:2379"
+        trainer.train.(
+            reader=cloud_reader(["/work/dataset/uci_housing/uci_housing*"], etcd_endpoints),
+        )
    """
    import os
-    import paddle.v2.master.client as cloud
-
-    if "KUBERNETES_SERVICE_HOST" not in os.environ.keys():
-        return recordio_local(paths)
-
-    host_name = "MASTER_SERVICE_HOST"
-    if host_name not in os.environ.keys():
-        raise Exception('not find ' + host_name + ' in environment variable.')
-
-    addr = os.environ(host)
+    import cPickle as pickle
+    import paddle.v2.master as master
+    c = master.client(etcd_endpoints, timeout_sec, buf_size)
+    c.set_dataset(paths)

    def reader():
-        c = cloud(addr, buf_size)
-        c.set_dataset(paths)
+        global pass_num
+        c.paddle_start_get_records(pass_num)
+        pass_num += 1

        while True:
-            r, err = client.next_record()
-            if err < 0:
+            r, e = c.next_record()
+            if not r:
+                if e != -2:
+                    print "get record error: ", e
                break
-            yield r
-
-        c.release()
+            yield pickle.loads(r)

    return reader
--- a/python/paddle/v2/reader/tests/creator_test.py
+++ b/python/paddle/v2/reader/tests/creator_test.py
@@ -34,14 +34,5 @@ class TestTextFile(unittest.TestCase):
            self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1))


-class TestRecordIO(unittest.TestCase):
-    def test_recordio(self):
-        path = os.path.join(
-            os.path.dirname(__file__), "test_recordio_creator.dat")
-        reader = paddle.v2.reader.creator.recordio([path])
-        for idx, r in enumerate(reader()):
-            self.assertSequenceEqual(r, str(idx))
-
-
 if __name__ == '__main__':
    unittest.main()