Merge remote-tracking branch 'origin/develop' into multigpu

d4d215a5 · Dong Zhihong · 5bcb6380 · 6729f32c · d4d215a5 · d4d215a5
46 changed file
--- a/doc/design/block.md
+++ b/doc/design/block.md
@@ -243,7 +243,7 @@ class SymbolTable {
  // TODO determine whether name is generated by python or C++.
  // Currently assume that a unique name will be generated by C++ if the
  // argument name is left default.
-  VarDesc* NewVar(const string& name="");
+  VarDesc* Var(const string& name="");

  // find a VarDesc by name, if recursive is true, find parent's SymbolTable
  // recursively.

--- a/doc/design/scope.md
+++ b/doc/design/scope.md
@@ -37,7 +37,7 @@ Scope is an association of a name to variable. All variables belong to `Scope`.
 ```cpp
 class Scope {
 public:
-  Variable* NewVar(const std::string& name);
+  Variable* Var(const std::string& name);
  const Variable* FindVar(const std::string& name) const;

 private:
@@ -98,7 +98,7 @@ class Scope {
  Variable* FindVar(const std::string& name) const;

  // return if already contains same name variable.
-  Variable* NewVar(const std::string& name);
+  Variable* Var(const std::string& name);

 private:
  std::shared_ptr<Scope> parent_;
@@ -107,7 +107,7 @@ class Scope {
 ```
 ## Only scope can create a variable

-To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `NewVar` can construct `Variable`.
+To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `Var` can construct `Variable`.

 ## When scope destroyed, all variables inside this scope should be destroyed together

@@ -121,4 +121,4 @@ Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shar

 ## Orthogonal interface

-`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `NewVar` will return an `Error` when there is a name conflict locally. Combine `FindVar` and `NewVar`, we can implement `NewVar` easily.
+`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `Var` will return an `Error` when there is a name conflict locally. Combine `FindVar` and `Var`, we can implement `Var` easily.
--- a/doc/design/tensor_array.md
+++ b/doc/design/tensor_array.md
@@ -161,7 +161,7 @@ class TensorArray:
        @name: str
            the name of the variable to output.
        '''
-        tensor = NewVar(name)
+        tensor = Var(name)
        tensor_array_stack(self.name, tensor)
        return tensor


--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -273,18 +273,41 @@ static bool AllGradInSet(const std::vector<std::string>& names,
  return true;
 }

-static void CreateGradVarInBlock(BlockDescBind* block_desc,
-                                 size_t grad_op_start_index) {
+static void CreateGradVarInBlock(
+    size_t grad_op_start_index,
+    const std::unordered_map<std::string, std::string>& param_name_map,
+    BlockDescBind* block_desc,
+    std::unordered_map<std::string, GradVarInfo>* grad_var_record) {
  auto ops = block_desc->AllOps();
  for (size_t op_index = grad_op_start_index; op_index < ops.size();
       ++op_index) {
-    for (const auto& output : ops[op_index]->Outputs()) {
-      for (const auto& real_output : output.second) {
-        if (!block_desc->HasVar(real_output)) {
-          block_desc->NewVar(real_output);
-        }
-      }
-    }
+    // <<<<<<< HEAD
+    //     for (const auto& output : ops[op_index]->Outputs()) {
+    //       for (const auto& real_output : output.second) {
+    //         if (!block_desc->HasVar(real_output)) {
+    //           block_desc->Var(real_output);
+    //         }
+    //       }
+    //     }
+    // =======
+    ForEachVarName(ops[op_index]->Outputs(),
+                   [&](const std::string& grad_var_name) {
+                     if (block_desc->HasVar(grad_var_name)) {
+                       return false;
+                     }
+                     block_desc->Var(grad_var_name);
+                     auto it = param_name_map.find(grad_var_name);
+                     if (it == param_name_map.end()) {
+                       return false;
+                     }
+                     auto param_var_name = it->second;
+                     auto& grad_record = (*grad_var_record)[param_var_name];
+                     grad_record.name_ = grad_var_name;
+                     grad_record.block_idx_ = block_desc->ID();
+                     grad_record.op_idx_ = static_cast<int>(op_index);
+                     return false; /* not break */
+                   });
+    // >>>>>>> origin/develop
  }
 }

@@ -400,8 +423,9 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
  return backward_descs;
 }

-void AppendBackward(ProgramDescBind& program_desc, const VarDescBind& target,
-                    const std::unordered_set<std::string>& no_grad_vars) {
+ParamGradInfoMap AppendBackward(
+    ProgramDescBind& program_desc, const VarDescBind& target,
+    const std::unordered_set<std::string>& no_grad_vars) {
  std::unordered_set<std::string> no_grad_var_names;
  no_grad_var_names.reserve(no_grad_vars.size() + 1);
  no_grad_var_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);
@@ -423,20 +447,28 @@ void AppendBackward(ProgramDescBind& program_desc, const VarDescBind& target,
  all_ops.push_back(std::move(fill_one_op));
  size_t forward_op_num = all_ops.size();
  size_t forward_block_num = program_desc.Size();
+
+  // Insert backward operators
  std::unordered_map<std::string, std::string> grad_to_var;
  auto backward_op_descs = MakeBlockBackward(program_desc, root_block_idx,
                                             &no_grad_var_names, &grad_to_var);
+
+  std::unordered_map<std::string, GradVarInfo> retv;
+
+  // Create Variable
  for (auto& ptr : backward_op_descs) {
    all_ops.push_back(std::move(ptr));
  }
-  root_block->NewVar(fill_one_op_out);
+  root_block->Var(fill_one_op_out);

  // create grad_var for all blocks in this program
-  CreateGradVarInBlock(root_block, forward_op_num);
+  CreateGradVarInBlock(forward_op_num, grad_to_var, root_block, &retv);
  for (size_t block_index = forward_block_num;
       block_index < program_desc.Size(); ++block_index) {
-    CreateGradVarInBlock(program_desc.Block(block_index), 0);
+    CreateGradVarInBlock(0, grad_to_var, program_desc.Block(block_index),
+                         &retv);
  }
+  return retv;
 }

 }  // namespace framework

--- a/paddle/framework/backward.h
+++ b/paddle/framework/backward.h
@@ -14,7 +14,10 @@

 #pragma once

+#include <string>
+#include <unordered_map>
 #include <unordered_set>
+
 #include "paddle/framework/operator.h"
 #include "paddle/framework/program_desc.h"

@@ -27,10 +30,18 @@ extern std::unique_ptr<OperatorBase> Backward(
    const OperatorBase& forwardOp,
    const std::unordered_set<std::string>& no_grad_vars);

-// TODO(jiayi): Add target as parameter and generate backward op
-// according to target.
-void AppendBackward(ProgramDescBind& program_desc, const VarDescBind& target,
-                    const std::unordered_set<std::string>& no_grad_vars);
+struct GradVarInfo {
+  std::string name_;
+  int block_idx_;
+  int op_idx_;
+};
+
+using ParamGradInfoMap = std::unordered_map<std::string /*fwd_var_name*/,
+                                            GradVarInfo /*grad_var_info*/>;
+
+ParamGradInfoMap AppendBackward(
+    ProgramDescBind& program_desc, const VarDescBind& target,
+    const std::unordered_set<std::string>& no_grad_vars);

 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@@ -18,19 +18,22 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-VarDescBind *BlockDescBind::NewVar(const std::string &name) {
+VarDescBind *BlockDescBind::Var(const std::string &name) {
  need_update_ = true;
  auto it = vars_.find(name);
-  PADDLE_ENFORCE(it == vars_.end(), "Duplicated variable %s", name);
-  auto var = new VarDescBind(name);
+  if (it != vars_.end()) {
+    return it->second.get();
+  }
+  auto *var = new VarDescBind(name);
  vars_[name].reset(var);
  return var;
 }

-VarDescBind *BlockDescBind::Var(const std::string &name) const {
+VarDescBind *BlockDescBind::FindVar(const std::string &name) const {
  auto it = vars_.find(name);
-  PADDLE_ENFORCE(it != vars_.end(),
-                 "Can not find variable %s in current block.", name);
+  if (it == vars_.end()) {
+    return nullptr;
+  }
  return it->second.get();
 }


--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
@@ -33,15 +33,6 @@ class ProgramDescBind;

 class BlockDescBind {
 public:
-  friend std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
-      ProgramDescBind &program_desc, int block_idx,
-      std::unordered_set<std::string> *no_grad_vars,
-      std::unordered_map<std::string, std::string> *grad_to_var);
-
-  friend void AppendBackward(
-      ProgramDescBind &program_desc, const VarDescBind &target,
-      const std::unordered_set<std::string> &no_grad_vars);
-
  BlockDescBind(ProgramDescBind *prog, BlockDesc *desc)
      : prog_(prog), desc_(desc), need_update_(false) {}

@@ -49,9 +40,9 @@ class BlockDescBind {

  int32_t Parent() const { return desc_->parent_idx(); }

-  VarDescBind *NewVar(const std::string &name_bytes);
+  VarDescBind *Var(const std::string &name_bytes);

-  VarDescBind *Var(const std::string &name_bytes) const;
+  VarDescBind *FindVar(const std::string &name_bytes) const;

  bool HasVar(const std::string &var_name) const;

@@ -69,7 +60,9 @@ class BlockDescBind {

  BlockDesc *Proto();

- private:
+  // FIXME(yuyang18): backward will access private data of BlockDesc.
+  // Mark it public temporarily. We can fix it later.
+ public:
  ProgramDescBind *prog_;  // not_own
  BlockDesc *desc_;        // not_own
  bool need_update_;

--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -66,7 +66,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) {

  // Instantiate all the vars in the global scope
  for (auto& var : block.vars()) {
-    scope->NewVar(var.name());
+    scope->Var(var.name());
  }

  Scope& local_scope = scope->NewScope();
@@ -78,7 +78,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) {
      for (auto& var : block.ops(i).outputs()) {
        for (auto& argu : var.arguments()) {
          if (local_scope.FindVar(argu) == nullptr) {
-            local_scope.NewVar(argu);
+            local_scope.Var(argu);
          }
        }
      }

--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -46,10 +46,16 @@ void AddOp(const std::string& type, const VariableNameMap& inputs,
  // insert output
  for (auto kv : outputs) {
    for (auto v : kv.second) {
+      // <<<<<<< HEAD
+      //       auto var = block->Var(v);
+      //       var->SetType(VarDesc::LOD_TENSOR);
+      //       var->SetDataType(paddle::framework::DataType::FP32);
+      // =======
      if (!block->HasVar(v)) {
-        auto var = block->NewVar(v);
+        auto var = block->Var(v);
        var->SetDataType(paddle::framework::DataType::FP32);
      }
+      // >>>>>>> origin/develop
    }
  }


--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -403,11 +403,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {

 private:
  DDim GetDim(const std::string& name) const override {
-    return framework::make_ddim(block_.Var(name)->Shape());
+    return framework::make_ddim(block_.FindVar(name)->Shape());
  }

  void SetDim(const std::string& name, const DDim& dim) override {
-    block_.Var(name)->SetShape(framework::vectorize(dim));
+    block_.FindVar(name)->SetShape(framework::vectorize(dim));
  }

  const OpDescBind& op_;

--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -84,7 +84,7 @@ TEST(OperatorBase, all) {
  paddle::framework::Scope scope;

  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
-  scope.NewVar("OUT1");
+  scope.Var("OUT1");
  ASSERT_EQ(paddle::framework::op_run_num, 0);
  op->Run(scope, device_context);
  ASSERT_EQ(paddle::framework::op_run_num, 1);
@@ -237,12 +237,12 @@ TEST(OpKernel, multi_inputs) {

  paddle::platform::CPUDeviceContext cpu_device_context;
  paddle::framework::Scope scope;
-  scope.NewVar("x0")->GetMutable<Tensor>();
-  scope.NewVar("x1")->GetMutable<Tensor>();
-  scope.NewVar("x2")->GetMutable<Tensor>();
-  scope.NewVar("k0")->GetMutable<Tensor>();
-  scope.NewVar("y0")->GetMutable<Tensor>();
-  scope.NewVar("y1")->GetMutable<Tensor>();
+  scope.Var("x0")->GetMutable<Tensor>();
+  scope.Var("x1")->GetMutable<Tensor>();
+  scope.Var("x2")->GetMutable<Tensor>();
+  scope.Var("k0")->GetMutable<Tensor>();
+  scope.Var("y0")->GetMutable<Tensor>();
+  scope.Var("y1")->GetMutable<Tensor>();

  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
  op->Run(scope, cpu_device_context);

--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -31,7 +31,7 @@ Scope& Scope::NewScope() const {
  return *kids_.back();
 }

-Variable* Scope::NewVar(const std::string& name) {
+Variable* Scope::Var(const std::string& name) {
  auto iter = vars_.find(name);
  if (iter != vars_.end()) {
    return iter->second;
@@ -42,8 +42,8 @@ Variable* Scope::NewVar(const std::string& name) {
  return v;
 }

-Variable* Scope::NewVar() {
-  return NewVar(string::Sprintf("%p.%d", this, vars_.size()));
+Variable* Scope::Var() {
+  return Var(string::Sprintf("%p.%d", this, vars_.size()));
 }

 Variable* Scope::FindVar(const std::string& name) const {
@@ -71,8 +71,8 @@ framework::Scope& GetGlobalScope() {
  static std::unique_ptr<framework::Scope> g_scope{nullptr};
  std::call_once(feed_variable_flag, [&]() {
    g_scope.reset(new framework::Scope());
-    g_scope->NewVar("feed_value");
-    g_scope->NewVar("fetch_value");
+    g_scope->Var("feed_value");
+    g_scope->Var("fetch_value");
  });
  return *(g_scope.get());
 }

--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -45,10 +45,10 @@ class Scope {
  Scope& NewScope() const;

  /// Create a variable with given name if it doesn't exist.
-  Variable* NewVar(const std::string& name);
+  Variable* Var(const std::string& name);

  /// Create a variable with a scope-unique name.
-  Variable* NewVar();
+  Variable* Var();

  /// Find a variable in the scope or any of its ancestors.  Returns
  /// nullptr if cannot find.

--- a/paddle/framework/scope_test.cc
+++ b/paddle/framework/scope_test.cc
@@ -23,8 +23,8 @@ TEST(Scope, VarsShadowing) {
  Scope& ss1 = s.NewScope();
  Scope& ss2 = s.NewScope();

-  Variable* v0 = s.NewVar("a");
-  Variable* v1 = ss1.NewVar("a");
+  Variable* v0 = s.Var("a");
+  Variable* v1 = ss1.Var("a");

  EXPECT_NE(v0, v1);

@@ -40,7 +40,7 @@ TEST(Scope, FindVar) {
  EXPECT_EQ(nullptr, s.FindVar("a"));
  EXPECT_EQ(nullptr, ss.FindVar("a"));

-  ss.NewVar("a");
+  ss.Var("a");

  EXPECT_EQ(nullptr, s.FindVar("a"));
  EXPECT_NE(nullptr, ss.FindVar("a"));
@@ -49,7 +49,7 @@ TEST(Scope, FindVar) {
 TEST(Scope, FindScope) {
  Scope s;
  Scope& ss = s.NewScope();
-  Variable* v = s.NewVar("a");
+  Variable* v = s.Var("a");

  EXPECT_EQ(&s, s.FindScope(v));
  EXPECT_EQ(&s, ss.FindScope(v));

--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -86,6 +86,7 @@ protected:
  /// Also used in 'use_mkldnn' case.
  std::vector<Argument> outputOtherDevice_;
  /// If there are several outputs, map them by each name.
+  /// MKLDNNLayer uses it only to merge output grad
  std::map<std::string, Argument*> outputMap_;
  /// Used to merge grad on different devices.
  MatrixPtr tmpGrad_;
@@ -325,6 +326,11 @@ public:
    outputMap_[name] = output;
  }

+  /**
+   * Get the output map size, if layer has multi-output.
+   */
+  size_t getOutputMapSize() { return outputMap_.size(); }
+
  /**
   * Get the output based on layer's name.
   */

--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -225,8 +225,6 @@ void MKLDNNConvLayer::resetFwdPipeline(
    MKLDNNMatrixPtr& wgt,
    MKLDNNMatrixPtr& bias,
    MKLDNNMatrixPtr& out) {
-  pipeline.clear();
-
  if (cvtInVal_) {
    pipeline.push_back(*cvtInVal_);
  }
@@ -245,7 +243,7 @@ void MKLDNNConvLayer::resetFwdPipeline(

 void MKLDNNConvLayer::resetInValue(
    std::shared_ptr<conv_fwd::primitive_desc>& pd, MKLDNNMatrixPtr& in) {
-  const MatrixPtr& inMat = inputLayers_[0]->getOutput().value;
+  const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
  in = MKLDNNMatrix::create(inMat, pd->src_primitive_desc());

  // create buffer and reorder if input value do not match
@@ -310,15 +308,20 @@ void MKLDNNConvLayer::resetOutValue(
    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
    memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
    cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_);
-    if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
+    if (cpuOutVal_->getPrimitiveDesc() != pd->dst_primitive_desc()) {
+      out = MKLDNNMatrix::create(nullptr, pd->dst_primitive_desc());
      cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
-      CHECK(cvtOutVal_) << "should not be emptry";
+      CHECK(cvtOutVal_) << "should not be empty";
    } else {
-      // CPU output share the same data of MKLDNN output
-      cpuOut->setData(out->getData());
      cpuOutVal_ = out;
    }
+    // when output is cpu device, change the mkldnn output value and make them
+    // share the same data. Then if next layer uses inputlayer->getOutputValue()
+    // to obtain the input value, it will get the right data.
+    output_.value = std::dynamic_pointer_cast<Matrix>(cpuOutVal_);
+    return;
  }
+  output_.value = std::dynamic_pointer_cast<Matrix>(out);
 }

 void MKLDNNConvLayer::resetBwdWgtPD(
@@ -412,8 +415,6 @@ void MKLDNNConvLayer::resetBwdPipeline(
    MKLDNNMatrixPtr& wgt,
    MKLDNNMatrixPtr& bias,
    MKLDNNMatrixPtr& out) {
-  pipeline.clear();
-
  if (cvtOutGrad_) {
    pipeline.push_back(*cvtOutGrad_);
  }
@@ -446,28 +447,27 @@ void MKLDNNConvLayer::resetBwdPipeline(

 void MKLDNNConvLayer::resetOutGrad(
    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD, MKLDNNMatrixPtr& out) {
-  const MatrixPtr& outMat = output_.grad;
-  out = MKLDNNMatrix::create(outMat, wgtPD->diff_dst_primitive_desc());
-  CHECK(outVal_ != nullptr &&
-        out->getPrimitiveDesc() == outVal_->getPrimitiveDesc())
-      << "primitive desc of out grad and value should be equal";
-
-  // TODO(TJ): merge outgrad
-  // create reorder if has output grad does not match
  cpuOutGrad_ = nullptr;
  cvtOutGrad_ = nullptr;
-  if (!outputIsOnlyMKLDNN()) {
+  CHECK(outVal_ != nullptr &&
+        outVal_->getPrimitiveDesc() == wgtPD->diff_dst_primitive_desc())
+      << "primitive desc of out grad and value should be equal";
+  if (outputIsOnlyMKLDNN()) {
+    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
+  } else {
    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
-    outMat->setData(cpuOut->getData());
    // same PrimitiveDesc with cpuInVal_
    CHECK(cpuOutVal_);
    cpuOutGrad_ = MKLDNNMatrix::create(cpuOut, cpuOutVal_->getPrimitiveDesc());
-    if (cpuOutGrad_->getPrimitiveDesc() == out->getPrimitiveDesc()) {
-      out = cpuOutGrad_;
-    } else {
-      out = MKLDNNMatrix::create(nullptr, wgtPD->diff_dst_primitive_desc());
+    // create reorder if primitive desc does not match
+    if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
+      out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
      cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
      CHECK(cvtOutGrad_);
+    } else {
+      // share the same data of CPU output
+      output_.grad->setData(cpuOut->getData());
+      out = cpuOutGrad_;
    }
  }
 }
@@ -496,32 +496,30 @@ void MKLDNNConvLayer::resetWgtBiasGrad(
 void MKLDNNConvLayer::resetInGrad(
    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
    MKLDNNMatrixPtr& in) {
+  in = nullptr;
+  cpuInGrad_ = nullptr;
+  cvtInGrad_ = nullptr;
  if (dataPD == nullptr) {
    return;
  }

-  // TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
-  in = MKLDNNMatrix::create(inputLayers_[0]->getOutput().grad,
-                            dataPD->diff_src_primitive_desc());
-  CHECK(nullptr != inVal_ &&
-        in->getPrimitiveDesc() == inVal_->getPrimitiveDesc())
-      << "primitive desc of input grad and value should be equal";
-
-  // create reorder if has output grad does not match
-  cpuInGrad_ = nullptr;
-  cvtInGrad_ = nullptr;
-  if (!inputIsOnlyMKLDNN()) {
+  if (inputIsOnlyMKLDNN()) {
+    MKLDNNLayer::resetInGrad(in, dataPD->diff_src_primitive_desc());
+    CHECK(nullptr != inVal_ &&
+          in->getPrimitiveDesc() == inVal_->getPrimitiveDesc())
+        << "primitive desc of input grad and value should be equal";
+  } else {
    const MatrixPtr& cpuIn = getInputGrad(0, CPU_DEVICE);
    // same PrimitiveDesc with cpuInVal_
    CHECK(cpuInVal_);
    cpuInGrad_ = MKLDNNMatrix::create(cpuIn, cpuInVal_->getPrimitiveDesc());
-    if (cpuInGrad_->getPrimitiveDesc() != in->getPrimitiveDesc()) {
-      const MatrixPtr& dnnIn = getInputGrad(0, MKLDNN_DEVICE);
-      in = MKLDNNMatrix::create(dnnIn, in->getPrimitiveDesc());
+    in = cpuInGrad_;
+    // create reorder if PrimitiveDesc does not match
+    if (cpuInGrad_->getPrimitiveDesc() != dataPD->diff_src_primitive_desc()) {
+      in = MKLDNNMatrix::create(getInputGrad(0, MKLDNN_DEVICE),
+                                dataPD->diff_src_primitive_desc());
      cvtInGrad_ = MKLDNNMatrix::createReorder(in, cpuInGrad_);
      CHECK(cvtInGrad_);
-    } else {
-      in = cpuInGrad_;
    }
  }
 }

--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -180,10 +180,10 @@ void MKLDNNFcLayer::resetWgtBiasValue(MKLDNNMatrixPtr& wgt,
 void MKLDNNFcLayer::resetOutValue(MKLDNNMatrixPtr& out) {
  out = MKLDNNMatrix::create(output_.value, {bs_, oc_}, format::nc, engine_);
  if (!outputIsOnlyMKLDNN()) {
-    // fc cpu output value do not need create convert
-    // just share point
+    // fc cpu output value does not need a convert, just share the data
    getOutput(CPU_DEVICE).value->setData(out->getData());
  }
+  output_.value = std::dynamic_pointer_cast<Matrix>(out);
 }

 void MKLDNNFcLayer::resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
@@ -214,8 +214,6 @@ void MKLDNNFcLayer::resetFwdPipeline(
    MKLDNNMatrixPtr& wgt,
    MKLDNNMatrixPtr& bias,
    MKLDNNMatrixPtr& out) {
-  pipeline.clear();
-
  if (bias) {
    fwd_.reset(new fc_fwd(*pd, *in, *wgt, *bias, *out));
  } else {
@@ -237,19 +235,14 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
 }

 void MKLDNNFcLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
-  // TODO(TJ): merge outgrad
-  int device = outputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
-  output_.grad->setData(getOutput(device).grad->getData());
-  // for MKLDNN device:
-  // can not directly cast outputgrad to mkldnnmatrix,
-  // since each layer can not write the inputgrad to mkldnn inputgrad.
-  // So just create from matrix with outputvalue format.
-  // for CPU device:
-  // fc do not need to convert from cpu device since output is always nc format
-  // only need create from cpu device
  CHECK(outVal_);
-  out =
-      MKLDNNMatrix::create(getOutput(device).grad, outVal_->getPrimitiveDesc());
+  if (outputIsOnlyMKLDNN()) {
+    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
+  } else {
+    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
+    output_.grad->setData(cpuOut->getData());
+    out = MKLDNNMatrix::create(cpuOut, outVal_->getPrimitiveDesc());
+  }
 }

 void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt,
@@ -267,13 +260,11 @@ void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt,

 void MKLDNNFcLayer::resetInGrad(MKLDNNMatrixPtr& in) {
  in = nullptr;
-  const MatrixPtr& inGrad = inputLayers_[0]->getOutput().grad;
-  if (inGrad == nullptr) {
+  if (inputLayers_[0]->getOutput().grad == nullptr) {
    return;
  }
-  // TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
  CHECK(inVal_);
-  in = MKLDNNMatrix::create(inGrad, inVal_->getPrimitiveDesc());
+  MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());
 }

 void MKLDNNFcLayer::resetBwdWgtPD(
@@ -314,7 +305,6 @@ void MKLDNNFcLayer::resetBwdPipeline(
    MKLDNNMatrixPtr& wgt,
    MKLDNNMatrixPtr& bias,
    MKLDNNMatrixPtr& out) {
-  pipeline.clear();
  CHECK(inVal_);
  if (bias) {
    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVal_, *out, *wgt, *bias));

--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -65,6 +65,17 @@ protected:
  MKLDNNMatrixPtr biasVal_;
  MKLDNNMatrixPtr biasGrad_;

+  // merge grad primitive
+  std::shared_ptr<mkldnn::primitive> mergeGrad_;
+  std::vector<mkldnn::primitive> pipelineMergeGrad_;
+  // tmp input argument to save input grad, only used to merge grad
+  Argument tmpInArg_;
+  // since mkldnn sum does not support different formats
+  // (refer to https://github.com/01org/mkl-dnn/issues/134),
+  // we need to create a reorder manually and save a tmp MKLDNNMatrix
+  MKLDNNMatrixPtr tmpOutGrad_;
+  std::shared_ptr<mkldnn::primitive> tmpCvt_;
+
 public:
  explicit MKLDNNLayer(const LayerConfig& config)
      : Layer(config),
@@ -99,6 +110,7 @@ public:
    if (!Layer::init(layerMap, parameterMap)) {
      return false;
    }
+    setOutputMap();
    checkCPUOutputsNumber();

    stream_.reset(new MKLDNNStream());
@@ -118,12 +130,9 @@ public:
        VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
        // reset when input total sizes changed, not only the batchsize
        inputElemenCnt_ = elemenCnt;
+        pipelineFwd_.clear();
        reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
        resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
-        if (outVal_) {
-          // change original output value to mkldnn output value
-          output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
-        }
        convertWeightsFromPaddle();
        needResetBwd_ = true;
      }
@@ -144,9 +153,18 @@ public:
  void backward(const UpdateCallback& callback) override {
    if (needResetBwd_) {
      VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
+      pipelineBwd_.clear();
+      pipelineMergeGrad_.clear();
+      mergeGrad_ = nullptr;
      resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
      needResetBwd_ = false;
    }
+
+    // merge grad must be done before backward activation
+    if (mergeGrad_) {
+      REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str());
+      stream_->submit(pipelineMergeGrad_);
+    }
    {
      REGISTER_TIMER_INFO("BpActTimer", getName().c_str());
      backwardActivation();
@@ -247,6 +265,76 @@ protected:
    }
  }

+  /**
+   * reset the output grad matrix from primitive desc.
+   * and reset the merge grad primitive if needed.
+   * note: when this layer has several outputs,
+   *       it could not be mixed with cpu device,
+   *       since it can not get memory desc from cpu device.
+   */
+  virtual void resetOutGrad(MKLDNNMatrixPtr& out,
+                            mkldnn::memory::primitive_desc pd) {
+    CHECK(outputIsOnlyMKLDNN()) << "do not support mixed with other device yet";
+    mergeGrad_ = nullptr;
+    pipelineMergeGrad_.clear();
+    out = MKLDNNMatrix::create(output_.grad, pd);
+    if (outputMap_.size() <= 1) {
+      return;
+    }
+    std::vector<double> scales(outputMap_.size(), 1.0);
+    std::vector<mkldnn::memory::primitive_desc> srcPDs;
+    std::vector<mkldnn::primitive::at> srcs;
+    for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
+      MKLDNNMatrixPtr src =
+          std::dynamic_pointer_cast<MKLDNNMatrix>(it->second->grad);
+      VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first;
+      CHECK(src) << "should be MKLDNNMatrix";
+      auto srcDims = src->getDims();
+      auto dstDims = out->getDims();
+      CHECK_EQ(srcDims.size(), dstDims.size());
+      for (size_t i = 0; i < srcDims.size(); ++i) {
+        CHECK_EQ(srcDims[i], dstDims[i]);
+      }
+      srcPDs.push_back(src->getPrimitiveDesc());
+      srcs.push_back(*src);
+    }
+
+    // TODO(TJ): remove me when mkldnn sum support different formats
+    for (size_t i = 1; i < srcPDs.size(); ++i) {
+      CHECK(srcPDs[0] == srcPDs[i]);
+    }
+    tmpOutGrad_ = nullptr;
+    tmpCvt_ = nullptr;
+    if (out->getPrimitiveDesc() != srcPDs[0]) {
+      tmpOutGrad_ = MKLDNNMatrix::create(nullptr, srcPDs[0]);
+      tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
+      CHECK(tmpCvt_);
+      pipelineMergeGrad_.push_back(*tmpCvt_);
+    } else {
+      tmpOutGrad_ = out;
+    }
+
+    auto sumPD = mkldnn::sum::primitive_desc(
+        tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
+    mergeGrad_.reset(new mkldnn::sum(sumPD, srcs, *tmpOutGrad_));
+    pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
+  }
+
+  /**
+   * reset input grad from primitive desc.
+   * this function is available when the input is only mkldnn
+   * or the input does not care about cpu device
+   */
+  virtual void resetInGrad(MKLDNNMatrixPtr& in,
+                           mkldnn::memory::primitive_desc pd) {
+    LayerPtr& input = inputLayers_[0];
+    const MatrixPtr& grad =
+        input->getOutputMapSize() > 1 ? nullptr : input->getOutput().grad;
+    in = MKLDNNMatrix::create(grad, pd);
+    Argument& arg = input->getOutput(this->getName());
+    arg.grad = std::dynamic_pointer_cast<Matrix>(in);
+  }
+
  /**
   * print info about sizes
   */
@@ -334,6 +422,16 @@ private:
    }
  }

+  /**
+   * Set output map of prev layers.
+   */
+  void setOutputMap() {
+    outputMap_.clear();
+    for (size_t i = 0; i < inputLayers_.size(); ++i) {
+      inputLayers_[i]->setOutput(getName(), &tmpInArg_);
+    }
+  }
+
  /**
   * Check the cpu device number of outputOtherDevice_.
   * should have only one at most.

--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
@@ -142,14 +142,16 @@ void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) {
    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
    cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_);
    if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
+      out = MKLDNNMatrix::create(nullptr, out->getPrimitiveDesc());
      cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
      CHECK(cvtOutVal_) << "should not be emptry";
    } else {
-      // CPU output share the same data of MKLDNN output
-      cpuOut->setData(out->getData());
      cpuOutVal_ = out;
    }
+    output_.value = std::dynamic_pointer_cast<Matrix>(cpuOutVal_);
+    return;
  }
+  output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
 }

 void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
@@ -187,7 +189,6 @@ void MKLDNNPoolLayer::resetFwdPipeline(
    std::shared_ptr<pool_fwd::primitive_desc>& pd,
    MKLDNNMatrixPtr& in,
    MKLDNNMatrixPtr& out) {
-  pipeline.clear();
  fwd_ = workspace_
             ? std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_))
             : std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
@@ -205,17 +206,17 @@ void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
  resetInGrad(in);
 }
 void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
-  CHECK(outVal_) << "Should have output value";
-  out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
-
-  // create reorder if output value has cpu device and pd do not match
  cpuOutGrad_ = nullptr;
  cvtOutGrad_ = nullptr;
-  if (!outputIsOnlyMKLDNN()) {
+  CHECK(outVal_);
+  if (outputIsOnlyMKLDNN()) {
+    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
+  } else {
    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
    cpuOutGrad_ = MKLDNNMatrix::create(
        cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_);
-    if (cpuOutGrad_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
+    if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
+      out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
      cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
      CHECK(cvtOutGrad_) << "should not be emptry";
    } else {
@@ -228,12 +229,11 @@ void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {

 void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) {
   in = nullptr;
-  const MatrixPtr& inGrad = inputLayers_[0]->getOutput().grad;
-  if (inGrad == nullptr) {
+  if (inputLayers_[0]->getOutput().grad == nullptr) {  // no input grad
     return;
   }
   CHECK(inVal_);
-  in = MKLDNNMatrix::create(inGrad, inVal_->getPrimitiveDesc());
+  MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());  // base helper
 }

 void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
@@ -261,7 +261,6 @@ void MKLDNNPoolLayer::resetBwdPipeline(
    std::shared_ptr<pool_bwd::primitive_desc>& pd,
    MKLDNNMatrixPtr& in,
    MKLDNNMatrixPtr& out) {
-  pipeline.clear();
  if (cvtOutGrad_) {
    pipeline.push_back(*cvtOutGrad_);
  }

--- a/paddle/gserver/tests/MKLDNNTester.cpp
+++ b/paddle/gserver/tests/MKLDNNTester.cpp
@@ -124,8 +124,8 @@ void MKLDNNTester::randomTopDiffs() {
 void MKLDNNTester::checkForward() {
  VLOG(MKLDNN_ALL) << "Check Forward";
  printTopDatas();
-  double delta = compareMatrix(dnnLayer_->getOutput(CPU_DEVICE).value,
-                               refLayer_->getOutputValue());
+  double delta =
+      compareMatrix(dnnLayer_->getOutputValue(), refLayer_->getOutputValue());
  EXPECT_LE(fabs(delta), eps_);
 }


--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
@@ -134,7 +134,7 @@ void CondOp::PrepareDataForSubnet(
  for (int i = 0; i < BRANCH_NUM; ++i) {
    for (auto& output : (*sub_net_op_[i]).Outputs()) {
      for (auto& var_name : output.second) {
-        sub_scopes[i]->NewVar(var_name);
+        sub_scopes[i]->Var(var_name);
      }
    }
  }

--- a/paddle/operators/dynamic_recurrent_op.cc
+++ b/paddle/operators/dynamic_recurrent_op.cc
@@ -30,7 +30,7 @@ namespace detail {
 inline void CreateVariables(Scope& scope,
                            const std::vector<std::string>& var_names) {
  for (const auto& name : var_names) {
-    scope.NewVar(name);
+    scope.Var(name);
  }
 }

@@ -136,7 +136,7 @@ void DynamicRecurrentOp::WriteStepInputs() const {
      auto& step_scope = cache_.GetScope(step);
      Variable* var = step_scope.FindVar(item.first);
      if (var == nullptr) {
-        var = step_scope.NewVar(item.first);
+        var = step_scope.Var(item.first);
      }
      var->GetMutable<LoDTensor>()->ShareDataWith<value_type>(tensor);
    }

--- a/paddle/operators/dynamic_recurrent_op_test.cc
+++ b/paddle/operators/dynamic_recurrent_op_test.cc
@@ -36,7 +36,7 @@ void OpDescNewVar(const std::string& param_name,
 // create a LoD tensor in scope with specific dims
 LoDTensor* CreateVar(Scope& scope, std::string name, framework::DDim dims,
                     const platform::Place& place) {
-  auto* var = scope.NewVar(name);
+  auto* var = scope.Var(name);
  auto* tensor = var->GetMutable<LoDTensor>();
  tensor->Resize(dims);
  tensor->mutable_data<float>(place);
@@ -85,7 +85,7 @@ class DynamicRecurrentOpTestHelper : public ::testing::Test {

  void CreateGlobalVariables() {
    platform::CPUPlace place;
-    scope.NewVar("step_scopes");
+    scope.Var("step_scopes");
    CreateVar(scope, "boot_mem", framework::make_ddim({10, 20}), place);
    CreateVar(scope, "out0", framework::make_ddim({10, 20}), place);
    auto* in0 = CreateVar(scope, "in0", framework::make_ddim({10, 8}), place);

--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -70,14 +70,14 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope,
        // the weight are located in parent scope
        for (auto& var_name : input.second) {
          if (!step_scope.FindVar(var_name)) {
-            step_scope.NewVar(var_name)->GetMutable<LoDTensor>();
+            step_scope.Var(var_name)->GetMutable<LoDTensor>();
          }
        }
      }
      // create stepnet's outputs
      for (const auto& output : (*stepnet_)->Outputs()) {
        for (auto& var_name : output.second) {
-          step_scope.NewVar(var_name);
+          step_scope.Var(var_name);
        }
      }
      step_scopes->emplace_back(&step_scope);
@@ -87,7 +87,7 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope,

 void RecurrentAlgorithm::InitMemories(Scope* step_scope) const {
  for (auto& attr : arg_->memories) {
-    auto* pre_mem = step_scope->NewVar(attr.pre_var)->GetMutable<LoDTensor>();
+    auto* pre_mem = step_scope->Var(attr.pre_var)->GetMutable<LoDTensor>();
    PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
                   "memory [%s]'s boot variable [%s] not exists", attr.var,
                   attr.boot_var);
@@ -167,9 +167,9 @@ void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
                   "memory variable [%s] does not exists", attr.var);
    PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
                   "boot variable [%s] does not exists", attr.boot_var);
-    auto* mem_grad = step_scope->NewVar(attr.var)->GetMutable<LoDTensor>();
+    auto* mem_grad = step_scope->Var(attr.var)->GetMutable<LoDTensor>();
    auto* boot_mem_grad =
-        step_scope->NewVar(attr.boot_var)->GetMutable<LoDTensor>();
+        step_scope->Var(attr.boot_var)->GetMutable<LoDTensor>();
    boot_mem_grad->Resize(mem_grad->dims());
    boot_mem_grad->ShareDataWith<float>(*mem_grad);
  }

--- a/paddle/operators/rnn/recurrent_op_utils.cc
+++ b/paddle/operators/rnn/recurrent_op_utils.cc
@@ -40,7 +40,7 @@ void SegmentInputs(const std::vector<Scope*>& step_scopes,
    f::DDim step_dims = slice_ddim(dims, 1, dims.size());
    for (size_t j = 0; j < seq_len; j++) {
      Tensor* step_input =
-          step_scopes[j]->NewVar(inlinks[i])->GetMutable<Tensor>();
+          step_scopes[j]->Var(inlinks[i])->GetMutable<Tensor>();
      // The input of operators of each step is Tensor here.
      // Maybe need to modify Slice function.
      *step_input = input->Slice<float>(j, j + 1);

--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
 if(WITH_PYTHON)
  cc_library(paddle_pybind SHARED
    SRCS pybind.cc exception.cc protobuf.cc
-    DEPS pybind python backward proto_desc tensor_array paddle_memory
+    DEPS pybind python backward proto_desc tensor_array paddle_memory executor
    ${GLOB_OP_LIB})
 endif(WITH_PYTHON)
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -120,7 +120,19 @@ void BindProgramDesc(py::module &m) {
      .def("append_backward",
           [](ProgramDescBind &program_desc, const VarDescBind &target,
              const std::unordered_set<std::string> &no_grad_vars) {
-             AppendBackward(program_desc, target, no_grad_vars);
+             ParamGradInfoMap param_grad_map =
+                 AppendBackward(program_desc, target, no_grad_vars);
+             std::unordered_map<
+                 std::string, std::tuple<std::string /* grad_var_name */,
+                                         int /* block_idx */, int /* op_idx */>>
+                 retv;
+             for (auto it = param_grad_map.begin(); it != param_grad_map.end();
+                  ++it) {
+               const auto &grad_info = it->second;
+               retv[it->first] = std::make_tuple(
+                   grad_info.name_, grad_info.block_idx_, grad_info.op_idx_);
+             }
+             return retv;
           })
      .def("block", &ProgramDescBind::Block, py::return_value_policy::reference)
      .def("num_blocks", &ProgramDescBind::Size)
@@ -145,16 +157,16 @@ void BindBlockDesc(py::module &m) {
           py::return_value_policy::reference)
      .def("prepend_op", &BlockDescBind::PrependOp,
           py::return_value_policy::reference)
-      .def("new_var",
+      .def("var",
           [](BlockDescBind &self, py::bytes byte_name) {
             std::string name = byte_name;
-             return self.NewVar(name);
+             return self.Var(name);
           },
           py::return_value_policy::reference)
-      .def("var",
+      .def("find_var",
           [](BlockDescBind &self, py::bytes byte_name) {
             std::string name = byte_name;
-             return self.Var(name);
+             return self.FindVar(name);
           },
           py::return_value_policy::reference)
      .def("all_vars", &BlockDescBind::AllVars,

--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/pybind/protobuf.h"

 #include "paddle/framework/backward.h"
+#include "paddle/framework/executor.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/tensor_array.h"
 #include "paddle/operators/cond_op.h"
@@ -164,9 +165,9 @@ All parameter, weight, gradient are variables in Paddle.
           py::return_value_policy::reference);

  py::class_<Scope>(m, "Scope", "")
-      .def("new_var",
+      .def("var",
           [](Scope &self, const std::string &name) -> Variable * {
-             return self.NewVar(name);
+             return self.Var(name);
           },
           py::return_value_policy::reference)
      .def("find_var", &Scope::FindVar, py::return_value_policy::reference)
@@ -391,6 +392,14 @@ All parameter, weight, gradient are variables in Paddle.
             self.set_falsenet(net.Clone());
           });

+  py::class_<framework::Executor>(m, "Executor")
+      .def(py::init<std::vector<platform::Place> &>())
+      .def("run",
+           [](Executor &self, const ProgramDesc &program_desc, int block_id) {
+             framework::Scope &global_scope = GetGlobalScope();
+             self.Run(program_desc, &global_scope, block_id);
+           });
+
  m.def("unique_integer", UniqueIntegerGenerator);

  m.def("is_compile_gpu", IsCompileGPU);

--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -39,15 +39,18 @@ add_test(NAME test_CompareTwoNets

 ################ test_CompareMKLDNNandCPU ######################
 if(WITH_MKLDNN)
-  add_unittest_without_exec(test_CompareMKLDNNandCPU
-      test_CompareTwoNets.cpp)
-  add_test(NAME test_CompareMKLDNNandCPU
-    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-          ${CMAKE_CURRENT_BINARY_DIR}/test_CompareMKLDNNandCPU
-              --config_file_a=trainer/tests/sample_trainer_config_simple_net.conf --use_mkldnn_a=True
-              --config_file_b=trainer/tests/sample_trainer_config_simple_net.conf --use_mkldnn_b=False
-              --use_gpu=False
-      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+  macro(gen_command VAR_NAME CONFIG_FILE)  # store in ${VAR_NAME} the command list that compares MKLDNN vs CPU on CONFIG_FILE
+    set(${VAR_NAME} "${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh" "-d" "${PADDLE_SOURCE_DIR}/python/"
+                    "${CMAKE_CURRENT_BINARY_DIR}/test_CompareMKLDNNandCPU --use_gpu=False"
+                    "--config_file_a=trainer/tests/${CONFIG_FILE} --use_mkldnn_a=True"  # net "a": MKLDNN enabled
+                    "--config_file_b=trainer/tests/${CONFIG_FILE} --use_mkldnn_b=False"  # net "b": same config, MKLDNN disabled
+                    "WORKING_DIRECTORY" "${PADDLE_SOURCE_DIR}/paddle/")  # kept inside the list that add_test(... COMMAND ${VAR}) expands
+  endmacro()
+  add_unittest_without_exec(test_CompareMKLDNNandCPU test_CompareTwoNets.cpp)
+  gen_command(compare_simple_net "sample_trainer_config_simple_net.conf")
+  gen_command(compare_branch_net "sample_trainer_config_branch_net.conf")
+  add_test(NAME test_CompareMKLDNNandCPU_simple_net COMMAND ${compare_simple_net})
+  add_test(NAME test_CompareMKLDNNandCPU_branch_net COMMAND ${compare_branch_net})
 endif()

 ############### test_CompareTwoOpts ###################

--- a/paddle/trainer/tests/sample_trainer_config_branch_net.conf
+++ b/paddle/trainer/tests/sample_trainer_config_branch_net.conf
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *  # layer, activation and optimizer helpers used below
+
+################################### Data Configuration ###################################
+TrainData(ProtoData(files = "trainer/tests/mnist.list"))
+################################### Algorithm Configuration ###################################
+settings(batch_size = 256,
+         learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
+################################### Network Configuration ###################################
+data = data_layer(name ="input", size=784)  # 784 values per sample; presumably 28x28 MNIST pixels -- confirm
+
+tmp = img_conv_layer(input=data,  # stem: 3x3 conv whose output feeds both a1 and a2
+            num_channels=1,
+            filter_size=3,
+            num_filters=32,
+            padding=1,
+            shared_biases=True,
+            act=ReluActivation())
+
+a1 = img_conv_layer(input=tmp,  # branch a1: 1x1 conv on the stem output
+            filter_size=1,
+            num_filters=32,
+            padding=0,
+            shared_biases=True,
+            act=ReluActivation())
+
+a2 = img_conv_layer(input=tmp,  # branch a2: 3x3 conv on the same stem output
+            filter_size=3,
+            num_filters=32,
+            padding=1,
+            shared_biases=True,
+            act=ReluActivation())
+
+tmp = concat_layer(input=[a1, a2])  # first merge: join branches a1 and a2
+
+tmp = img_pool_layer(input=tmp,  # average pool; its output feeds both b1 and b2
+            num_channels=64,  # 64 = a1's 32 + a2's 32 filters
+            pool_size=3,
+            stride=2,
+            padding=1,
+            pool_type=AvgPooling())
+
+b1 = img_conv_layer(input=tmp,  # branch b1: 3x3 conv ...
+            filter_size=3,
+            num_filters=64,
+            padding=1,
+            shared_biases=True,
+            act=ReluActivation())
+
+b1 = img_pool_layer(input=b1,  # ... followed by a 3x3, stride-1 max pool
+            pool_size=3,
+            stride=1,
+            padding=1,
+            pool_type=MaxPooling())
+
+b2 = img_conv_layer(input=tmp,  # branch b2: 5x5 conv on the same pooled input ...
+            filter_size=5,
+            num_filters=64,
+            padding=2,
+            shared_biases=True,
+            act=ReluActivation())
+
+b2 = img_pool_layer(input=b2,  # ... followed by a 5x5, stride-1 max pool
+            pool_size=5,
+            stride=1,
+            padding=2,
+            pool_type=MaxPooling())
+
+tmp = addto_layer(input=[b1, b2],  # second merge: combine b1 and b2 with addto_layer (no bias), then ReLU
+            act=ReluActivation(),
+            bias_attr=False)
+
+tmp = img_pool_layer(input=tmp,  # 3x3, stride-2 max pool before the fully connected layers
+            pool_size=3,
+            stride=2,
+            padding=1,
+            pool_type=MaxPooling())
+
+tmp = fc_layer(input=tmp, size=64,  # hidden fully connected layer, tanh, no bias
+            bias_attr=False,
+            act=TanhActivation())
+
+output = fc_layer(input=tmp, size=10,  # 10-way softmax output
+            bias_attr=True,
+            act=SoftmaxActivation())
+
+lbl = data_layer(name ="label", size=10)  # label input, same size as the softmax output
+
+cost = classification_cost(input=output, label=lbl)
+outputs(cost)  # the cost layer is the network's declared output
--- a/python/paddle/v2/framework/default_scope_funcs.py
+++ b/python/paddle/v2/framework/default_scope_funcs.py
@@ -5,7 +5,7 @@ Default scope function.
 thread-local stack of Scope. Top of that stack is current scope, the bottom 
 of that stack is all scopes' parent. 

-Invoking `new_var/find_var`  can `new/find` variable in current scope. 
+Invoking `var/find_var` creates/finds a variable in the current scope.
 Invoking `enter_local_scope/leave_local_scope` can create or destroy local 
 scope. 

@@ -19,7 +19,7 @@ import threading
 __tl_scope__ = threading.local()

 __all__ = [
-    'get_cur_scope', 'enter_local_scope', 'leave_local_scope', 'new_var',
+    'get_cur_scope', 'enter_local_scope', 'leave_local_scope', 'var',
    'find_var', 'scoped_function'
 ]

@@ -54,11 +54,11 @@ def leave_local_scope():
    get_cur_scope().drop_kids()


-def new_var(name):
+def var(name):
    """
    create variable in current scope.
    """
-    return get_cur_scope().new_var(name)
+    return get_cur_scope().var(name)


 def find_var(name):

--- a/python/paddle/v2/framework/framework.py
+++ b/python/paddle/v2/framework/framework.py
@@ -20,11 +20,11 @@ class Variable(object):

        if name is None:
            name = Variable._unique_var_name_()
-        try:
+        is_new_var = False
+        self.desc = self.block.desc.find_var(name)
+
+        if self.desc is None:
            self.desc = self.block.desc.var(name)
-            is_new_var = False
-        except core.EnforceNotMet:
-            self.desc = self.block.desc.new_var(name)
            is_new_var = True

        if is_new_var:

--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/framework/tests/op_test.py
@@ -14,7 +14,7 @@ def create_op(scope, op_type, inputs, outputs, attrs):
    kwargs = dict()

    def __create_var__(name, var_name):
-        scope.new_var(var_name)
+        scope.var(var_name)
        kwargs[name].append(var_name)

    for in_name, in_dup in Operator.get_op_inputs(op_type):
@@ -71,7 +71,7 @@ def set_input(scope, op, inputs, place):
 def set_output_grad(scope, op, outputs, place):
    def __set_tensor__(name):
        out_tensor = scope.find_var(name).get_tensor()
-        grad_tensor = scope.new_var(grad_var_name(name)).get_tensor()
+        grad_tensor = scope.var(grad_var_name(name)).get_tensor()
        out_dtype = out_tensor.dtype()
        if out_dtype == core.DataType.FP64:
            data = np.ones(out_tensor.shape(), dtype=np.float64)
@@ -169,10 +169,10 @@ def get_numeric_gradient(scope,
 def get_backward_op(scope, op, no_grad_set):
    backward_op = core.Operator.backward(op, no_grad_set)
    for input in backward_op.input_vars():
-        var = scope.new_var(input)
+        var = scope.var(input)
        var.get_tensor()
    for output in backward_op.output_vars():
-        var = scope.new_var(output)
+        var = scope.var(output)
        var.get_tensor()
    return backward_op


--- a/python/paddle/v2/framework/tests/test_cond_op.py
+++ b/python/paddle/v2/framework/tests/test_cond_op.py
@@ -39,7 +39,7 @@ class PySimpleCondTest(unittest.TestCase):


 def create_tensor(scope, name, shape, np_data):
-    tensor = scope.new_var(name).get_tensor()
+    tensor = scope.var(name).get_tensor()
    tensor.set_dims(shape)
    tensor.set(np_data, core.CPUPlace())
    return tensor
@@ -74,9 +74,9 @@ class TestCondOp(unittest.TestCase):
        create_tensor(self.scope, "X", [10, 1], x_np_data)
        cond_np_data = self.py_cond.cond.astype("int32")
        create_tensor(self.scope, "cond", [10, 1], cond_np_data)
-        self.scope.new_var("SubScopes")
-        self.scope.new_var("IndexTensors")
-        self.scope.new_var("Out")
+        self.scope.var("SubScopes")
+        self.scope.var("IndexTensors")
+        self.scope.var("Out")

    def create_cond_op(self):
        self.condop = CondOp(

--- a/python/paddle/v2/framework/tests/test_default_scope_funcs.py
+++ b/python/paddle/v2/framework/tests/test_default_scope_funcs.py
@@ -10,7 +10,7 @@ class TestDefaultScopeFuncs(unittest.TestCase):
        self.assertIsNone(find_var("test"))

    def test_create_var_get_var(self):
-        var_a = new_var("var_a")
+        var_a = var("var_a")
        self.assertIsNotNone(var_a)
        self.assertIsNotNone(get_cur_scope().find_var('var_a'))
        enter_local_scope()
@@ -19,7 +19,7 @@ class TestDefaultScopeFuncs(unittest.TestCase):

    def test_var_get_int(self):
        def __new_scope__():
-            i = new_var("var_i")
+            i = var("var_i")
            self.assertFalse(i.is_int())
            i.set_int(10)
            self.assertTrue(i.is_int())

--- a/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
@@ -6,7 +6,7 @@ import numpy as np


 def create_tensor(scope, name, shape, np_data):
-    tensor = scope.new_var(name).get_tensor()
+    tensor = scope.var(name).get_tensor()
    tensor.set_dims(shape)
    tensor.set(np_data, core.CPUPlace())
    return tensor
@@ -72,8 +72,8 @@ class DynamicRecurrentOpTest(unittest.TestCase):
        create_tensor(self.scope, "U", [self.input_dim, self.input_dim], U)
        create_tensor(self.scope, "h_boot", [self.num_sents, self.input_dim],
                      h_boot)
-        self.scope.new_var("step_scopes")
-        self.scope.new_var("h@mem")
+        self.scope.var("step_scopes")
+        self.scope.var("h@mem")

    def create_rnn_op(self):
        # create RNNOp

--- a/python/paddle/v2/framework/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/framework/tests/test_gaussian_random_op.py
@@ -14,7 +14,7 @@ class TestGaussianRandomOp(unittest.TestCase):

    def gaussian_random_test(self, place):
        scope = core.Scope()
-        scope.new_var('Out').get_tensor()
+        scope.var('Out').get_tensor()

        op = Operator(
            "gaussian_random",

--- a/python/paddle/v2/framework/tests/test_infer_shape.py
+++ b/python/paddle/v2/framework/tests/test_infer_shape.py
@@ -13,14 +13,14 @@ class TestInferShape(unittest.TestCase):
        shape = [10, 20]

        # prepare input/output
-        x1 = block.new_var("x1")
+        x1 = block.var("x1")
        x1.set_type(core.VarDesc.VarType.LOD_TENSOR)
        x1.set_shape(shape)
-        x2 = block.new_var("x2")
+        x2 = block.var("x2")
        x2.set_type(core.VarDesc.VarType.LOD_TENSOR)
        x2.set_shape(shape)

-        out = block.new_var("out")
+        out = block.var("out")
        out.set_type(core.VarDesc.VarType.LOD_TENSOR)

        # prepare the operator
@@ -42,14 +42,14 @@ class TestInferShape(unittest.TestCase):
        y_shape = [20, 30]

        # prepare input/output
-        x1 = block.new_var("x")
+        x1 = block.var("x")
        x1.set_type(core.VarDesc.VarType.LOD_TENSOR)
        x1.set_shape(x_shape)
-        x2 = block.new_var("y")
+        x2 = block.var("y")
        x2.set_type(core.VarDesc.VarType.LOD_TENSOR)
        x2.set_shape(y_shape)

-        out = block.new_var("out")
+        out = block.var("out")
        out.set_type(core.VarDesc.VarType.LOD_TENSOR)

        # prepare the operator

--- a/python/paddle/v2/framework/tests/test_mnist.py
+++ b/python/paddle/v2/framework/tests/test_mnist.py
@@ -31,7 +31,7 @@ uniq_id = atomic_id().next


 def data_layer(name, dims):
-    var = scope.new_var(name)
+    var = scope.var(name)
    tensor = var.get_tensor()
    tensor.set_dims(dims)  # 1 is batch size holder.
    return name
@@ -67,7 +67,7 @@ def sgd_optimizer(net, param_name, learning_rate=0.005):

 # should use operator and add these to the init_network
 def init_param(net, param_name, dims):
-    scope.new_var(param_name)
+    scope.var(param_name)
    op = Operator(
        "uniform_random", Out=param_name, dims=dims, min=-0.5, max=0.5, seed=10)
    op.infer_shape(scope)
@@ -104,7 +104,7 @@ def fc_layer(net, input, size, act="softmax", bias=True, param=None, name=None):
    sgd_optimizer(net=optimize_net, param_name=w_name, learning_rate=0.01)

    pre_activation = name + ".mul.out"
-    scope.new_var(pre_activation)
+    scope.var(pre_activation)
    mul_op = Operator("mul", X=input, Y=w_name, Out=pre_activation)
    net.append_op(mul_op)

@@ -115,7 +115,7 @@ def fc_layer(net, input, size, act="softmax", bias=True, param=None, name=None):
        sgd_optimizer(
            net=optimize_net, param_name=bias_name, learning_rate=0.001)
        bias_out = name + ".rowwise_add.out"
-        scope.new_var(bias_out)
+        scope.var(bias_out)
        rowwise_append_op = Operator(
            "rowwise_add", X=pre_activation, b=bias_name, Out=bias_out)
        net.append_op(rowwise_append_op)
@@ -123,7 +123,7 @@ def fc_layer(net, input, size, act="softmax", bias=True, param=None, name=None):

    activation_op = Operator(act, X=pre_activation, Y=name)
    net.append_op(activation_op)
-    scope.new_var(name)
+    scope.var(name)
    net.infer_shape(scope)
    return name

@@ -133,7 +133,7 @@ def cross_entropy_layer(net, input, label):
    cross_entropy_op = Operator(
        "cross_entropy", X=input, Label=label, Y=cost_name)
    net.append_op(cross_entropy_op)
-    scope.new_var(cost_name)
+    scope.var(cost_name)
    net.infer_shape(scope)
    return cost_name

@@ -141,10 +141,10 @@ def cross_entropy_layer(net, input, label):
 def create_backward_net(forward_net):
    net = core.Operator.backward(forward_net, set())
    for input in net.inputs()["all"]:
-        var = scope.new_var(input)
+        var = scope.var(input)
        var.get_tensor()
    for output in net.outputs()["all"]:
-        var = scope.new_var(output)
+        var = scope.var(output)
        var.get_tensor()
    return net


--- a/python/paddle/v2/framework/tests/test_program.py
+++ b/python/paddle/v2/framework/tests/test_program.py
@@ -51,17 +51,24 @@ class TestProgram(unittest.TestCase):
        sum_op_desc.set_input("Y", ["b1"])
        sum_op_desc.set_output("Out", ["out2"])

-        target = block.new_var("out2")
+        target = block.var("out2")

        expect_ops = [
            "mul", "elementwise_add", "fill_constant", "elementwise_add_grad",
            "mul_grad"
        ]
+
+        def grad_name(name):
+            return name + "@GRAD"
+
        actual_ops = []
-        prog.append_backward(target, set())
+        param_to_grad = prog.append_backward(target, set())
+        for var_name in ("x1", "y1", "out1", "b1"):
+            self.assertEqual(param_to_grad[var_name][0], grad_name(var_name))
+            self.assertEqual(param_to_grad[var_name][1], 0)
+
        for op in block.all_ops():
            actual_ops.append(op.type())
-        print(actual_ops)
        self.assertEqual(actual_ops, expect_ops)



--- a/python/paddle/v2/framework/tests/test_protobuf_descs.py
+++ b/python/paddle/v2/framework/tests/test_protobuf_descs.py
@@ -93,7 +93,7 @@ class TestVarDesc(unittest.TestCase):
    def test_shape(self):
        program_desc = core.ProgramDesc.__create_program_desc__()
        block = program_desc.block(0)
-        var = block.new_var('my_var')
+        var = block.var('my_var')
        var.set_type(core.VarDesc.VarType.SELECTED_ROWS)
        src_shape = [3, 2, 10, 8]
        var.set_shape(src_shape)
@@ -104,7 +104,7 @@ class TestVarDesc(unittest.TestCase):
    def test_data_type(self):
        program_desc = core.ProgramDesc.__create_program_desc__()
        block = program_desc.block(0)
-        var = block.new_var('my_var')
+        var = block.var('my_var')
        var.set_type(core.VarDesc.VarType.LOD_TENSOR)
        var.set_data_type(core.DataType.INT32)
        self.assertEqual(core.DataType.INT32, var.data_type())
@@ -117,12 +117,12 @@ class TestBlockDesc(unittest.TestCase):
        self.assertIsNotNone(prog)
        block = prog.block(0)
        self.assertIsNotNone(block)
-        var1 = block.new_var("var1")
-        var2 = block.new_var("var2")
-        var3 = block.new_var("var3")
+        var1 = block.var("var1")
+        var2 = block.var("var2")
+        var3 = block.var("var3")
        all_vars = block.all_vars()
        self.assertEqual(set(all_vars), set([var1, var2, var3]))
-        var2_re = block.var("var2")
+        var2_re = block.find_var("var2")
        self.assertEqual(var2_re, var2)

    def test_add_op(self):

--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
@@ -66,7 +66,7 @@ class PySimpleRNNTest(unittest.TestCase):


 def create_tensor(scope, name, shape, np_data):
-    tensor = scope.new_var(name).get_tensor()
+    tensor = scope.var(name).get_tensor()
    tensor.set_dims(shape)
    tensor.set(np_data, core.CPUPlace())
    return tensor
@@ -125,8 +125,8 @@ class RecurrentOpTest(unittest.TestCase):
        h_boot_np_data = self.py_rnn.h_boot
        create_tensor(self.scope, "h_boot", [self.batch_size, self.input_dim],
                      h_boot_np_data)
-        self.scope.new_var("step_scopes")
-        self.scope.new_var("h@mem")
+        self.scope.var("step_scopes")
+        self.scope.var("h@mem")

    def create_rnn_op(self):
        # create RNNOp

--- a/python/paddle/v2/framework/tests/test_scope.py
+++ b/python/paddle/v2/framework/tests/test_scope.py
@@ -18,7 +18,7 @@ class TestScope(unittest.TestCase):
    def test_create_var_get_var(self):
        paddle_c = paddle.v2.framework.core
        scope = paddle_c.Scope()
-        var_a = scope.new_var("var_a")
+        var_a = scope.var("var_a")
        self.assertIsNotNone(var_a)
        self.assertIsNotNone(scope.find_var('var_a'))
        scope2 = scope.new_scope()
@@ -27,7 +27,7 @@ class TestScope(unittest.TestCase):
    def test_var_get_int(self):
        paddle_c = paddle.v2.framework.core
        scope = paddle_c.Scope()
-        var = scope.new_var("test_int")
+        var = scope.var("test_int")
        var.set_int(10)
        self.assertTrue(var.is_int())
        self.assertEqual(10, var.get_int())

--- a/python/paddle/v2/framework/tests/test_tensor.py
+++ b/python/paddle/v2/framework/tests/test_tensor.py
@@ -6,7 +6,7 @@ import numpy
 class TestTensor(unittest.TestCase):
    def test_int_tensor(self):
        scope = core.Scope()
-        var = scope.new_var("test_tensor")
+        var = scope.var("test_tensor")
        place = core.CPUPlace()

        tensor = var.get_tensor()
@@ -25,7 +25,7 @@ class TestTensor(unittest.TestCase):

    def test_float_tensor(self):
        scope = core.Scope()
-        var = scope.new_var("test_tensor")
+        var = scope.var("test_tensor")
        place = core.CPUPlace()

        tensor = var.get_tensor()
@@ -46,7 +46,7 @@ class TestTensor(unittest.TestCase):
    def test_int_lod_tensor(self):
        place = core.CPUPlace()
        scope = core.Scope()
-        var_lod = scope.new_var("test_lod_tensor")
+        var_lod = scope.var("test_lod_tensor")
        lod_tensor = var_lod.get_tensor()

        lod_tensor.set_dims([4, 4, 6])
@@ -68,7 +68,7 @@ class TestTensor(unittest.TestCase):
    def test_float_lod_tensor(self):
        place = core.CPUPlace()
        scope = core.Scope()
-        var_lod = scope.new_var("test_lod_tensor")
+        var_lod = scope.var("test_lod_tensor")

        lod_tensor = var_lod.get_tensor()
        lod_tensor.set_dims([5, 2, 3, 4])

--- a/python/paddle/v2/framework/tests/test_tensor_array.py
+++ b/python/paddle/v2/framework/tests/test_tensor_array.py
@@ -13,7 +13,7 @@ class TestTensorArray(unittest.TestCase):

        # create a LoDTensor
        self.scope = core.Scope()
-        var = self.scope.new_var("test_tensor")
+        var = self.scope.var("test_tensor")
        self.place = core.CPUPlace()
        tensor = var.get_tensor()
        tensor.set_dims([self.batch_size, self.dim])
@@ -51,7 +51,7 @@ class TestTensorArray(unittest.TestCase):
        self.ta.unstack(self.tensor)

        # create a tensor with shape of [1, self.dim]
-        var = self.scope.new_var("hell")
+        var = self.scope.var("hell")
        tensor = var.get_tensor()
        tensor.set_dims([1, self.dim])
        tensor.alloc_float(self.place)
@@ -71,7 +71,7 @@ class TestTensorArray(unittest.TestCase):
        self.ta.unstack(self.tensor)

        # create a tensor with shape of [1, self.dim]
-        var = self.scope.new_var("hell")
+        var = self.scope.var("hell")
        tensor = var.get_tensor()
        tensor.set_dims([1, self.dim])
        tensor.alloc_float(self.place)

--- a/python/paddle/v2/framework/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/framework/tests/test_uniform_random_op.py
@@ -14,7 +14,7 @@ class TestUniformRandomOp(unittest.TestCase):

    def uniform_random_test(self, place):
        scope = core.Scope()
-        scope.new_var('X').get_tensor()
+        scope.var('X').get_tensor()

        op = Operator(
            "uniform_random",