diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 2cdf323c53a8ba729ec74c1eacb9fa3ef272f44a..ad219887d6d35c46cee53da1f04014c9bd23fb04 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -18,8 +18,8 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope) proto_library(framework_proto SRCS framework.proto) cc_library(attribute SRCS attribute.cc DEPS framework_proto) - -cc_library(operator SRCS operator.cc DEPS framework_proto device_context tensor scope attribute) +cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) +cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator) @@ -56,5 +56,6 @@ cc_library(paddle_pybind SHARED recurrent_op uniform_random_op gaussian_random_op - fill_zeros_like_op) + fill_zeros_like_op + scale_op) endif(WITH_PYTHON) diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc index b93ab66f2f5b9cffa6d51b6e36afe552125970e4..f100c4d05489ac3bd4ceb5f11ae871985f0e5d83 100644 --- a/paddle/framework/backward_test.cc +++ b/paddle/framework/backward_test.cc @@ -72,8 +72,8 @@ class NoGradOpMaker : public OpProtoAndCheckerMaker { class FcOp : public operators::NetOp { public: - FcOp(const std::string &type, const VarNameMap &inputs, - const VarNameMap &outputs, const AttributeMap &attrs) + FcOp(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs) : NetOp(type, inputs, outputs, attrs) { AppendOp(OpRegistry::CreateOp("mul", {{"X", {Input("X")}}, {"Y", {Input("W")}}}, diff --git a/paddle/framework/grad_op_builder.cc b/paddle/framework/grad_op_builder.cc index 0a2a41f6b62658ac8633a6e384d099f8d6641f33..b02a599a800668b22e7fe39a10fa6dc132e305bd 100644 --- a/paddle/framework/grad_op_builder.cc +++ b/paddle/framework/grad_op_builder.cc @@ -20,13 +20,13 @@ namespace framework { enum class OpArgType { IN, OUT }; static void TransOpArg(const OperatorBase* src_op, const OpArgType& src_type, - bool is_grad, OperatorBase::VarNameMap* vars) { + bool is_grad, VariableNameMap* vars) { const auto& src_inout = src_type == OpArgType::IN ? src_op->Inputs() : src_op->Outputs(); auto& dst_inout = *vars; - const OpProto* proto = OpRegistry::op_info_map().at(src_op->Type()).proto_; + auto& proto = OpInfoMap::Instance().Get(src_op->Type()).Proto(); const auto& src_arg_list = - src_type == OpArgType::IN ? proto->inputs() : proto->outputs(); + src_type == OpArgType::IN ? 
proto.inputs() : proto.outputs(); for (const auto& arg : src_arg_list) { if (arg.not_in_gradient() && !is_grad) continue; const std::string src_name = arg.name(); @@ -40,26 +40,18 @@ static void TransOpArg(const OperatorBase* src_op, const OpArgType& src_type, } OperatorBase* BuildGradOp(const OperatorBase* op) { - auto it = OpRegistry::op_info_map().find(op->Type()); - PADDLE_ENFORCE(it != OpRegistry::op_info_map().end(), - "'%s' has not been registered.", op->Type()); - PADDLE_ENFORCE(it->second.proto_ != nullptr, "'%s' has no OpProto.", - op->Type()); - std::string grad_op_type = it->second.grad_op_type_; - PADDLE_ENFORCE(!grad_op_type.empty(), "'%s' has no gradient operator.", - op->Type()); + auto& info = OpInfoMap::Instance().Get(op->Type()); + PADDLE_ENFORCE(info.HasGradientOp()); - OperatorBase::VarNameMap inputs; - OperatorBase::VarNameMap outputs; + VariableNameMap inputs; + VariableNameMap outputs; TransOpArg(op, OpArgType::IN, false, &inputs); // I TransOpArg(op, OpArgType::OUT, false, &inputs); // O TransOpArg(op, OpArgType::OUT, true, &inputs); // OG TransOpArg(op, OpArgType::IN, true, &outputs); // IG - it = OpRegistry::op_info_map().find(grad_op_type); - PADDLE_ENFORCE(it != OpRegistry::op_info_map().end(), - "'%s' has not been registered.", grad_op_type); - return it->second.creator_(grad_op_type, inputs, outputs, op->Attrs()); + auto& grad_info = OpInfoMap::Instance().Get(info.grad_op_type_); + return grad_info.Creator()(info.grad_op_type_, inputs, outputs, op->Attrs()); } } // namespace framework diff --git a/paddle/framework/op_info.cc b/paddle/framework/op_info.cc new file mode 100644 index 0000000000000000000000000000000000000000..81ba29797c5f478e5d6a91236f3e8de1e6b43e49 --- /dev/null +++ b/paddle/framework/op_info.cc @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/op_info.h" + +namespace paddle { +namespace framework { + +static OpInfoMap* g_op_info_map = nullptr; + +OpInfoMap& OpInfoMap::Instance() { + if (g_op_info_map == nullptr) { + g_op_info_map = new OpInfoMap(); + } + return *g_op_info_map; +} +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/op_info.h b/paddle/framework/op_info.h new file mode 100644 index 0000000000000000000000000000000000000000..94245c6c44aca962b0db890947a9dc5550ac0799 --- /dev/null +++ b/paddle/framework/op_info.h @@ -0,0 +1,101 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include <functional> +#include <map> +#include <string> +#include <unordered_map> + +#include "paddle/framework/attribute.h" + +namespace paddle { +namespace framework { +class OperatorBase; +using VariableNameMap = std::map<std::string, std::vector<std::string>>; + +using OpCreator = std::function<OperatorBase*(const std::string&, const VariableNameMap&, const VariableNameMap&, const AttributeMap&)>; + +struct OpInfo { + OpCreator creator_; + std::string grad_op_type_; + OpProto* proto_; + OpAttrChecker* checker_; + + bool HasOpProtoAndChecker() const { + return proto_ != nullptr && checker_ != nullptr; + } + + const OpProto& Proto() const { + PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered"); + PADDLE_ENFORCE(proto_->IsInitialized(), + "Operator Proto must be initialized in op info"); + return *proto_; + } + + const OpAttrChecker& Checker() const { + PADDLE_ENFORCE_NOT_NULL(checker_, + "Operator Checker has not been registered"); + return *checker_; + } + + const OpCreator& Creator() const { + PADDLE_ENFORCE_NOT_NULL(creator_, + "Operator Creator has not been registered"); + return creator_; + } + + bool HasGradientOp() const { return !grad_op_type_.empty(); } +}; + +class OpInfoMap { + public: + static OpInfoMap& Instance(); + + OpInfoMap(const OpInfoMap& o) = delete; + OpInfoMap(OpInfoMap&& o) = delete; + OpInfoMap& operator=(const OpInfoMap& o) = delete; + OpInfoMap& operator=(OpInfoMap&& o) = delete; + + bool Has(const std::string& op_type) const { + return map_.find(op_type) != map_.end(); + } + + void Insert(const std::string& type, const OpInfo& info) { + PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type); + map_.insert({type, info}); + } + + const OpInfo& Get(const std::string& type) const { + auto it = map_.find(type); + PADDLE_ENFORCE(it != map_.end(), "Operator %s is not found", type); + return it->second; + } + + template <typename Callback> + void IterAllInfo(Callback callback) { + for (auto& it : map_) { + callback(it.first, it.second); + } + } + + private: + OpInfoMap() = default; + std::unordered_map<std::string, OpInfo> map_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/op_registry.cc b/paddle/framework/op_registry.cc index 8eae86e9605da74cdc37caeb9569e7500aac2a63..b0e85dd49f97da4a7f889fde0b5f060954947be8 100644 --- a/paddle/framework/op_registry.cc +++ b/paddle/framework/op_registry.cc @@ -19,32 +19,18 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -std::unique_ptr OpRegistry::CreateOp(const std::string& type, - const VarNameMap& inputs, - const VarNameMap& outputs, - AttributeMap attrs) { - auto it = op_info_map().find(type); - PADDLE_ENFORCE(it != op_info_map().end(), - "Operator '%s' has not been registered.", type); - it->second.checker_->Check(attrs); - auto op = it->second.creator_(type, inputs, outputs, attrs); +std::unique_ptr OpRegistry::CreateOp( + const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, AttributeMap attrs) { + auto& info = OpInfoMap::Instance().Get(type); + info.Checker().Check(attrs); + auto op = info.Creator()(type, inputs, outputs, attrs); return std::unique_ptr(op); } -std::unique_ptr OpRegistry::CreateOp(const OpDesc& op_desc) { - VarNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs()); - VarNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs()); - AttributeMap attrs; - for (auto& attr : op_desc.attrs()) { - attrs[attr.name()] = GetAttrValue(attr); - } - - return CreateOp(op_desc.type(), inputs, outputs, attrs); -} - -OperatorBase::VarNameMap OpRegistry::ConvertOpDescVarsToVarNameMap( +static VariableNameMap ConvertOpDescVarsToVarNameMap( const google::protobuf::RepeatedPtrField& op_desc_vars) { - VarNameMap ret_val; + VariableNameMap ret_val; for (auto& var : op_desc_vars) { auto& var_names = ret_val[var.parameter()]; auto& var_names_in_proto = var.arguments(); @@ -55,6 +41,17 @@ OperatorBase::VarNameMap OpRegistry::ConvertOpDescVarsToVarNameMap( return ret_val; } +std::unique_ptr OpRegistry::CreateOp(const OpDesc& op_desc) { + VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs()); + VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs()); + AttributeMap attrs; + for (auto& attr : op_desc.attrs()) { + attrs[attr.name()] = GetAttrValue(attr); + } + + return CreateOp(op_desc.type(), inputs, outputs, attrs); +} + std::unique_ptr OpRegistry::CreateGradOp(const OperatorBase& op) { PADDLE_ENFORCE(!op.IsNetOp(), "Use framework::Backward to get backward ops"); return std::unique_ptr(BuildGradOp(&op)); diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 4c2d13d639005d2d2710c19f63988333d89bce13..2d09cde41e3f5086279f9441e0fdc52549bed5ab 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -23,6 +23,7 @@ limitations under the License. 
*/ #include "paddle/framework/attribute.h" #include "paddle/framework/framework.pb.h" #include "paddle/framework/grad_op_builder.h" +#include "paddle/framework/op_info.h" #include "paddle/framework/operator.h" #include "paddle/framework/scope.h" @@ -30,28 +31,16 @@ namespace paddle { namespace framework { class OpRegistry { - using VarNameMap = OperatorBase::VarNameMap; - using OpCreator = std::function; - public: - struct OpInfo { - OpCreator creator_; - std::string grad_op_type_; - OpProto* proto_; - OpAttrChecker* checker_; - }; - template static void RegisterOp(const std::string& op_type, const std::string& grad_op_type) { - PADDLE_ENFORCE(op_info_map().count(op_type) == 0, + PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type), "'%s' is registered more than once.", op_type); OpInfo op_info; - op_info.creator_ = [](const std::string& type, const VarNameMap& inputs, - const VarNameMap& outputs, - const AttributeMap& attrs) { + op_info.creator_ = []( + const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) { return new OpType(type, inputs, outputs, attrs); }; op_info.grad_op_type_ = grad_op_type; @@ -70,7 +59,7 @@ class OpRegistry { op_info.proto_ = nullptr; op_info.checker_ = nullptr; } - op_info_map().insert(std::make_pair(op_type, op_info)); + OpInfoMap::Instance().Insert(op_type, op_info); // register gradient op if (!grad_op_type.empty()) { RegisterOp(grad_op_type, ""); @@ -78,21 +67,13 @@ class OpRegistry { } static std::unique_ptr CreateOp(const std::string& type, - const VarNameMap& inputs, - const VarNameMap& outputs, + const VariableNameMap& inputs, + const VariableNameMap& outputs, AttributeMap attrs); static std::unique_ptr CreateOp(const OpDesc& op_desc); - static VarNameMap ConvertOpDescVarsToVarNameMap( - const google::protobuf::RepeatedPtrField& op_desc_vars); - static std::unique_ptr CreateGradOp(const OperatorBase& op); - - static std::unordered_map& op_info_map() { - static std::unordered_map op_info_map_; - return op_info_map_; - } }; class Registrar { diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index eadd8f3316ff1ebffb94a56b2e62d661e4e0b38f..7abbde610f1e9c530393b9a9cabe40b826712212 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -115,8 +115,8 @@ void OperatorBase::Rename(const std::string& old_name, } OperatorBase::OperatorBase(const std::string& type, - const OperatorBase::VarNameMap& inputs, - const OperatorBase::VarNameMap& outputs, + const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) : type_(type), inputs_(inputs), outputs_(outputs), attrs_(attrs) { static std::atomic gUniqId(0UL); @@ -141,18 +141,10 @@ std::vector OperatorBase::OutputVars(bool has_intermediate) const { } return ret_val; } - auto it = OpRegistry::op_info_map().find(type_); - PADDLE_ENFORCE( - it != OpRegistry::op_info_map().end(), - "Operator %s not registered, cannot figure out intermediate outputs", - type_); - PADDLE_ENFORCE( - it->second.proto_ != nullptr, - "Operator %s has no OpProto, cannot figure out intermediate outputs", - type_); + auto& info = OpInfoMap::Instance().Get(Type()); // get all OpProto::Var for outputs - for (auto& o : it->second.proto_->outputs()) { + for (auto& o : info.Proto().outputs()) { // ignore all intermediate output if (o.intermediate()) continue; auto out = outputs_.find(o.name()); diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 
807298088981b969622174be753ea0da72067243..8397570d26f06f0238e9c5afc85d721df7679257 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include +#include "op_info.h" #include "paddle/framework/attribute.h" #include "paddle/framework/framework.pb.h" #include "paddle/framework/scope.h" @@ -62,10 +63,8 @@ class ExecutionContext; */ class OperatorBase { public: - using VarNameMap = std::map>; - - OperatorBase(const std::string& type, const VarNameMap& inputs, - const VarNameMap& outputs, const AttributeMap& attrs); + OperatorBase(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs); virtual ~OperatorBase() {} @@ -93,8 +92,8 @@ class OperatorBase { /// rename inputs outputs name void Rename(const std::string& old_name, const std::string& new_name); - const VarNameMap& Inputs() const { return inputs_; } - const VarNameMap& Outputs() const { return outputs_; } + const VariableNameMap& Inputs() const { return inputs_; } + const VariableNameMap& Outputs() const { return outputs_; } //! Get a input with argument's name described in `op_proto` const std::string& Input(const std::string& name) const; //! Get a input which has multiple variables. @@ -122,30 +121,32 @@ class OperatorBase { // I (Inputs)opear // O (Outputs) // OG (Output Gradients) - VarNameMap inputs_; + VariableNameMap inputs_; // NOTE: in case of OpGrad, outputs_ contains // IG (Inputs Gradients) - VarNameMap outputs_; + VariableNameMap outputs_; AttributeMap attrs_; }; // Macro for define a clone method. // If you are writing an kernel operator, `Clone` will be defined when you // register it. i.e. `Clone` method is not needed to define by yourself. -#define DEFINE_OP_CLONE_METHOD(CLS) \ +#define DEFINE_OP_CLONE_METHOD(cls) \ std::unique_ptr Clone() const final { \ - return std::unique_ptr(new CLS(*this)); \ + return std::unique_ptr(new cls(*this)); \ } // Macro for define a default constructor for Operator. // You can also use // using PARENT_CLASS::PARENT_CLASS; // to use parent's constructor. 
-#define DEFINE_OP_CONSTRUCTOR(CLS, PARENT_CLS) \ - CLS(const std::string& type, const VarNameMap& inputs, \ - const VarNameMap& outputs, const paddle::framework::AttributeMap& attrs) \ - : PARENT_CLS(type, inputs, outputs, attrs) {} +#define DEFINE_OP_CONSTRUCTOR(cls, parent_cls) \ + cls(const std::string& type, \ + const ::paddle::framework::VariableNameMap& inputs, \ + const ::paddle::framework::VariableNameMap& outputs, \ + const paddle::framework::AttributeMap& attrs) \ + : parent_cls(type, inputs, outputs, attrs) {} class NOP : public OperatorBase { public: @@ -389,8 +390,8 @@ class OperatorWithKernel : public OperatorBase { using OpKernelMap = std::unordered_map, OpKernelHash>; - OperatorWithKernel(const std::string& type, const VarNameMap& inputs, - const VarNameMap& outputs, const AttributeMap& attrs) + OperatorWithKernel(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs) {} void InferShape(const Scope& scope) const override { diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index 2425b87779f6af01b0e8a91b5f574a28385f0efd..1d7efb7b9403f7c1c6bdbb27a0258f79ae032f43 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -23,8 +23,8 @@ static int op_run_num = 0; class OpWithoutKernelTest : public OperatorBase { public: - OpWithoutKernelTest(const std::string& type, const VarNameMap& inputs, - const VarNameMap& outputs, const AttributeMap& attrs) + OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs), x(1) {} void InferShape(const Scope& scope) const override {} void Run(const Scope& scope, @@ -249,8 +249,9 @@ TEST(OpKernel, multi_inputs) { class OperatorClone : public paddle::framework::OperatorBase { public: DEFINE_OP_CLONE_METHOD(OperatorClone); - OperatorClone(const std::string& type, const VarNameMap& inputs, - const VarNameMap& outputs, + OperatorClone(const std::string& type, + const paddle::framework::VariableNameMap& inputs, + const paddle::framework::VariableNameMap& outputs, const paddle::framework::AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs) {} void InferShape(const paddle::framework::Scope& scope) const override {} diff --git a/paddle/framework/pybind.cc b/paddle/framework/pybind.cc index 4539a1903eb430eb0d76a787adb32984342a468d..b5ae81ebca1201cc84aba897ba86d6413405e036 100644 --- a/paddle/framework/pybind.cc +++ b/paddle/framework/pybind.cc @@ -42,6 +42,8 @@ USE_OP(fill_zeros_like); USE_OP_ITSELF(recurrent_op); USE_OP(gaussian_random); USE_OP(uniform_random); +USE_OP(scale); +USE_OP_ITSELF(identity); USE_CPU_ONLY_OP(gather); namespace paddle { @@ -139,19 +141,16 @@ All parameter, weight, gradient are variables in Paddle. //! @note: Be careful! PyBind will return std::string as an unicode, not //! Python str. If you want a str object, you should cast them in Python. 
m.def("get_all_op_protos", []() -> std::vector { - auto &op_info_map = OpRegistry::op_info_map(); std::vector ret_values; - for (auto it = op_info_map.begin(); it != op_info_map.end(); ++it) { - const OpProto *proto = it->second.proto_; - if (proto == nullptr) { - continue; - } - PADDLE_ENFORCE(proto->IsInitialized(), "OpProto must all be initialized"); + + OpInfoMap::Instance().IterAllInfo([&ret_values](const std::string &type, + const OpInfo &info) { + if (!info.HasOpProtoAndChecker()) return; std::string str; - PADDLE_ENFORCE(proto->SerializeToString(&str), + PADDLE_ENFORCE(info.Proto().SerializeToString(&str), "Serialize OpProto Error. This could be a bug of Paddle."); - ret_values.push_back(py::bytes(str)); - } + ret_values.emplace_back(str); + }); return ret_values; }); m.def_submodule( diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index b8c779f4e5fc7bc51298cdd35b26c2c8ac98edf6..643f875491724bf443bd7727391734377ee6180c 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -105,7 +105,10 @@ class Tensor { template inline Tensor Slice(const int& begin_idx, const int& end_idx) const; - platform::Place place() const { return holder_->place(); } + platform::Place place() const { + PADDLE_ENFORCE_NOT_NULL(holder_, "Tensor get place() must contains holder"); + return holder_->place(); + } private: template diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index 157b1ab45163a94a81d859dbcb7a52ae8edae439..1829f72a87054d9e4ead97962ca1f6738e585787 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -1012,11 +1012,6 @@ void RecurrentGradientMachine::generateSequence() { /* width */ resultNum, false, /* useGpu */ false); - Matrix::resizeOrCreate(generator_.outArg.value, - /* height */ maxGenWordCount, - /* width */ 1, - false, - /* useGpu */ false); } ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions, numSequences + 1, @@ -1026,7 +1021,7 @@ void RecurrentGradientMachine::generateSequence() { } else { oneWaySearch(numSequences); } - if (dataArgsSize_) createDataOutlink(batchMachineIdVec_); + if (dataArgsSize_) createDataOutlink(); size_t size = generator_.ids.size(); generator_.outArg.ids->resize(size); @@ -1106,6 +1101,7 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) { } batchMachineIdVec_.clear(); + batchMachineStartPos_.clear(); int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false); starts[0] = 0; generator_.ids.clear(); @@ -1312,13 +1308,20 @@ void RecurrentGradientMachine::fillGenOutputs() { finalPaths_[i].resize(minFinalPathsSize); } - batchMachineIdVec_.clear(); generator_.ids.clear(); int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false); starts[0] = 0; if (numResults > 1) { - real* probs = generator_.outArg.in->getData(); + int idsProbSaveSize = 0; + for (auto inSeq : finalPaths_) { + for (auto path : inSeq) idsProbSaveSize += path.ids.size(); + idsProbSaveSize += inSeq.size(); + } + Matrix::resizeOrCreate( + generator_.outArg.value, idsProbSaveSize, 1, false, false); real* idsProb = generator_.outArg.value->getData(); + + real* probs = generator_.outArg.in->getData(); size_t curPos = 0; for (size_t i = 0; i < finalPaths_.size(); ++i) { for (size_t j = 0; j < finalPaths_[i].size(); ++j) { @@ -1333,24 +1336,16 @@ void RecurrentGradientMachine::fillGenOutputs() { curPos += genLen; 
idsProb[curPos++] = -1.0; probs[i * numResults + j] = path.logProb; - - if (!j && dataArgsSize_) { - // in beam search, here only reserved the top 1 generated result - // for out_links that are not the generated word indices. - batchMachineIdVec_.insert(batchMachineIdVec_.end(), - path.machineIdVec.begin(), - path.machineIdVec.end()); - } } starts[i + 1] = generator_.ids.size(); } } else { for (size_t i = 0; i < finalPaths_.size(); ++i) { CHECK(!finalPaths_[i].empty()); - generator_.ids.insert(generator_.ids.begin(), - finalPaths_[i][0].ids.begin(), - finalPaths_[i][0].ids.end()); - starts[i + 1] = starts[i] + finalPaths_[i][0].ids.size(); + Path& path = finalPaths_[i][0]; + generator_.ids.insert( + generator_.ids.begin(), path.ids.begin(), path.ids.end()); + starts[i + 1] = starts[i] + path.ids.size(); } } } @@ -1364,25 +1359,76 @@ void RecurrentGradientMachine::copyDataOutlinkFrame(size_t machineCur) { } } -void RecurrentGradientMachine::createDataOutlink( - std::vector& machineIdVec) { - size_t seqNum = - getBeamSize() > 1UL ? finalPaths_.size() : finalPaths_[0].size(); - std::vector starts(seqNum + 1, 0); - for (size_t i = 0; i < seqNum; ++i) { - size_t seqLen = getBeamSize() > 1UL ? finalPaths_[i][0].ids.size() - : finalPaths_[0][i].ids.size(); - starts[i + 1] = starts[i] + seqLen; +void RecurrentGradientMachine::createDataOutlinkSelRowsInfo( + bool isSeq, std::vector& outArgs) { + batchMachineIdVec_.clear(); + + size_t seqIdx = 0; + for (size_t i = 0; i < finalPaths_.size(); ++i) { + for (size_t j = 0; j < finalPaths_[i].size(); ++j) { + std::vector& machineIdVec = finalPaths_[i][j].machineIdVec; + if (isSeq) { + for (size_t i = 0; i < machineIdVec.size(); ++i) { + size_t rowId = machineIdVec[i]; + int* seqPos = + outArgs[i].sequenceStartPositions->getMutableData(false); + batchMachineIdVec_.push_back(seqPos[rowId]); + } + } else { + batchMachineIdVec_.insert( + batchMachineIdVec_.end(), machineIdVec.begin(), machineIdVec.end()); + } + seqIdx++; + } + } +} + +void RecurrentGradientMachine::createDataOutlinkCopySizeInfo( + bool isSeq, std::vector& outArgs, std::vector& copySize) { + size_t totalSeqNum = std::accumulate( + finalPaths_.begin(), + finalPaths_.end(), + 0UL, + [](size_t a, const std::vector& b) { return a + b.size(); }); + copySize.resize(totalSeqNum, 1); + + batchMachineStartPos_.resize(totalSeqNum + 1, 0); + if (isSeq) { + ICpuGpuVectorPtr inputSeqStartPos = outArgs[0].sequenceStartPositions; + CHECK_EQ(static_cast(inputSeqStartPos->getSize() - 1), + getBeamSize() > 1 ? finalPaths_.size() : finalPaths_[0].size()); + int* starts = inputSeqStartPos->getMutableData(false); + int seqId = 0; + for (int i = 0; i < finalPaths_.size(); ++i) { + for (int j = 0; j < finalPaths_[i].size(); ++j) { + copySize[seqId] = getBeamSize() > 1 ? 
starts[i + 1] - starts[i] + : starts[j + 1] - starts[j]; + batchMachineStartPos_[seqId + 1] = + batchMachineStartPos_[seqId] + finalPaths_[i][j].ids.size(); + seqId++; + } + } + } else { + for (size_t i = 0; i < finalPaths_[0].size(); ++i) + batchMachineStartPos_[i + 1] = + batchMachineStartPos_[i] + finalPaths_[0][i].ids.size(); } +} +void RecurrentGradientMachine::createDataOutlink() { for (size_t i = 0; i < dataArgsSize_; i++) { + bool isSeq = dataArgsFrame_[i][0].hasSeq(); + std::vector copySize; + createDataOutlinkCopySizeInfo(isSeq, dataArgsFrame_[i], copySize); + createDataOutlinkSelRowsInfo(isSeq, dataArgsFrame_[i]); + dataArgs_[i].concat(dataArgsFrame_[i], - machineIdVec, - starts, + batchMachineIdVec_, + batchMachineStartPos_, + copySize, useGpu_, HPPL_STREAM_1, PASS_TEST); - auto dataAgent = dynamic_cast(outFrameLines_[i + 1].agentLayer.get()); CHECK_NOTNULL(dataAgent); diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h index fb3fc5877ac96323e891f800db80af83b6809831..c16fae6d1770e616fdcfabd440624c9be9753c91 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h @@ -190,7 +190,7 @@ public: std::vector ids; /** - * @brief idsProb, log probability of each generated words. + * @brief idsProb, log probability of each generated word. */ std::vector idsProb; @@ -472,15 +472,43 @@ private: void copyDataOutlinkFrame(size_t machineCur); /* - * @brief In generation, if the layer group has more than 1 outlink, outlinks - * except the first one are data outlinks. This function creates the data - * outlinks. - * @note In beam search, only one generated sequence with the hightest log - * probabilites are retained. - * @param machineIdVec : select a row of output matrix in each frame - * that the generation process expanded. + * @brief In generation, if the layer group has more than 1 outlink, every + * outlink except the first one is a data outlink. In RecurrentLayerGroup, + * each time step is a separate Network, so the outputs of a layer inside the + * RecurrentLayerGroup are stored in separate Arguments. If one layer is + * specified as an outlink of the RecurrentLayerGroup, this function collects + * its outputs in every time step of each generated sequence, which are + * dispersed in separate Arguments, into a new single Argument that forms the + * output of the RecurrentLayerGroup. */ - void createDataOutlink(std::vector& machineIdVec); + void createDataOutlink(); + + /* + * @brief decide how many rows to select, starting from a given position, + * from the Matrix that stores the forward pass results. + * + * @param isSeq: a flag indicating whether the layer to be output by the + * RecurrentGradientMachine is a sequence or not + * @param outArgs: all of the returned Arguments of the forward pass + * during the generation process. + * @param copySize: the returned result, the number of rows to select from + * the Matrix that stores the forward pass results, starting from a given + * position. + */ + void createDataOutlinkCopySizeInfo(bool isSeq, + std::vector& outArgs, + std::vector& copySize); + + /* + * @brief decide the index of the start row for each time step of a generated + * sequence in the Matrix that stores the forward pass results of the entire + * beam search batch. + * + * @param isSeq: a flag indicating whether the layer to be output by the + * RecurrentGradientMachine is a sequence or not + * @param outArgs: all of the returned Arguments of the forward pass + * during the generation process. 
+ */ + void createDataOutlinkSelRowsInfo(bool isSeq, std::vector& outArgs); /* * @brief used in beam search, connect previous frame to form recurrent link @@ -543,6 +571,7 @@ private: std::vector topIds_; std::vector seqIds_; std::vector batchMachineIdVec_; + std::vector batchMachineStartPos_; std::vector> finalPaths_; std::vector minFinalPathLogProb_; BeamSearchControlCallbacks* beamSearchCtrlCallbacks_; diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index ba1362e8bf38ef4735ffeea29bea12f6eff99982..58e9d594c40b130f7fd37ecc1a48b6ba0152669e 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -69,3 +69,4 @@ op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc DEPS framework_proto tensor op_registry operator net_op) op_library(uniform_random_op SRCS uniform_random_op.cc uniform_random_op.cu) +op_library(scale_op SRCS scale_op.cc scale_op.cu DEPS net_op) diff --git a/paddle/operators/net_op.cc b/paddle/operators/net_op.cc index a7d710511093dfbe13a13b1222b0230bba0398bd..44d925f0b0cc5ff20d52e548816f118c2027343a 100644 --- a/paddle/operators/net_op.cc +++ b/paddle/operators/net_op.cc @@ -68,10 +68,15 @@ std::string NetOp::DebugString() const { bool NetOp::IsNetOp() const { return true; } std::vector NetOp::OutputVars(bool has_intermediate) const { + std::vector all; + for (auto& pair : this->outputs_) { + for (auto& var_name : pair.second) { + all.push_back(var_name); + } + } if (has_intermediate) { - return this->outputs_.at(kAll); + return all; } - auto& all = this->outputs_.at(kAll); std::vector ret_val; for (auto& each : all) { if (!Contains(intermediate_outputs_, each)) { @@ -81,9 +86,8 @@ std::vector NetOp::OutputVars(bool has_intermediate) const { return ret_val; } -NetOp::NetOp(const std::string& type, - const framework::OperatorBase::VarNameMap& inputs, - const framework::OperatorBase::VarNameMap& outputs, +NetOp::NetOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, const framework::AttributeMap& attrs) : framework::OperatorBase(type, inputs, outputs, attrs) {} diff --git a/paddle/operators/net_op.h b/paddle/operators/net_op.h index 3d3f996ef52b6c1136425ca9de0f60e7e155458f..fcd8134b2c19cae6a4d006a4cd6fe32d2d627c34 100644 --- a/paddle/operators/net_op.h +++ b/paddle/operators/net_op.h @@ -38,8 +38,10 @@ class NetOp : public framework::OperatorBase { public: static const char kAll[]; NetOp() : framework::OperatorBase("plain_net", {}, {}, {}) {} - NetOp(const std::string& type, const VarNameMap& inputs, - const VarNameMap& outputs, const framework::AttributeMap& attrs); + + NetOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs); NetOp(const NetOp& o) : framework::OperatorBase(o.type_, {}, {}, o.attrs_) { this->ops_.reserve(o.ops_.size()); diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc index 78ce0ba3c0fa4fe380e49a848c2434fe593cd00b..16bd249cb3d989c695ec9378f09d48833d70be58 100644 --- a/paddle/operators/recurrent_op.cc +++ b/paddle/operators/recurrent_op.cc @@ -131,8 +131,8 @@ const rnn::ArgumentName RecurrentGradientOp::kArgName{ "memories", "pre_memories", "boot_memories@grad"}; RecurrentOp::RecurrentOp(const std::string& type, - const framework::OperatorBase::VarNameMap& inputs, - const framework::OperatorBase::VarNameMap& outputs, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& 
outputs, const framework::AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs) { rnn::InitArgument(kArgName, &arg_, *this); @@ -223,8 +223,8 @@ void RecurrentGradientAlgorithm::InferShape(const Scope& scope) const { } RecurrentGradientOp::RecurrentGradientOp( - const std::string& type, const framework::OperatorBase::VarNameMap& inputs, - const framework::OperatorBase::VarNameMap& outputs, + const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, const framework::AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs) { rnn::InitArgument(kArgName, &arg_, *this); diff --git a/paddle/operators/recurrent_op.h b/paddle/operators/recurrent_op.h index bcfa817de8242153b164fa091309f19a6ad8a246..1033d657a3a8f96c8b3dae8dd93d3f1f6840b59b 100644 --- a/paddle/operators/recurrent_op.h +++ b/paddle/operators/recurrent_op.h @@ -114,8 +114,9 @@ class RecurrentGradientAlgorithm { class RecurrentOp : public framework::OperatorBase { public: - RecurrentOp(const std::string& type, const VarNameMap& inputs, - const VarNameMap& outputs, const framework::AttributeMap& attrs); + RecurrentOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs); RecurrentOp(const RecurrentOp& o) : framework::OperatorBase( @@ -150,8 +151,9 @@ class RecurrentOp : public framework::OperatorBase { class RecurrentGradientOp : public framework::OperatorBase { public: - RecurrentGradientOp(const std::string& type, const VarNameMap& inputs, - const VarNameMap& outputs, + RecurrentGradientOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, const framework::AttributeMap& attrs); RecurrentGradientOp(const RecurrentGradientOp& o) diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8e96a74c94ab7ff4d8c3266695e5157aff67905b --- /dev/null +++ b/paddle/operators/scale_op.cc @@ -0,0 +1,105 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/operators/scale_op.h" +#include "paddle/operators/net_op.h" + +namespace paddle { +namespace operators { + +class ScaleOp : public framework::OperatorWithKernel { + public: + ScaleOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + auto *in = ctx.Input("X"); + auto *out = ctx.Output("Out"); + out->Resize(in->dims()); + } +}; + +template +class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input tensor of scale operator.").NotInGradient(); + AddOutput("Out", "The output tensor of scale operator.").NotInGradient(); + AddComment(R"DOC(Scale operator + +The equation is: Out = scale*X +)DOC"); + AddAttr("scale", "scale of scale operator.").SetDefault(1.0); + } +}; + +// Identity Op's gradient is identity op, too. +// Grad(Out=scale(X)) => Grad(X) = scale(Grad(Out)) +template +class ScaleGradOp : public NetOp { + public: + ScaleGradOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : NetOp(type, inputs, outputs, attrs) { + AppendOp(framework::OpRegistry::CreateOp( + "scale", {{"X", {Input(framework::GradVarName("Out"))}}}, + {{"Out", {Output(framework::GradVarName("X"))}}}, + {{"scale", GetAttr("scale")}})); + CompleteAddOp(false); + } +}; + +// identity is a alias of scale op. This is also a example for creating a alias +// operator. +template +class IdentityOpMaker : public framework::OpProtoAndCheckerMaker { + public: + IdentityOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "input tensor of identity op"); + AddOutput("Out", "output tensor of identity op"); + AddComment("identity operator. Just a alias of scale op which scale = 1.0"); + } +}; + +template +class IdentityOp : public NetOp { + public: + IdentityOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : NetOp(type, inputs, outputs, attrs) { + AppendOp(framework::OpRegistry::CreateOp( + "scale", {{"X", {Input("X")}}}, {{"Out", {Output("Out")}}}, + {{"scale", static_cast(1)}})); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(scale, ops::ScaleOp, ops::ScaleOpMaker, scale_grad, + ops::ScaleGradOp); +REGISTER_OP_CPU_KERNEL(scale, + ops::ScaleKernel); +REGISTER_OP_WITHOUT_GRADIENT(identity, ops::IdentityOp, + ops::IdentityOpMaker); diff --git a/paddle/operators/scale_op.cu b/paddle/operators/scale_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..63efbe0da8a90dd237d2d692076075339179acf6 --- /dev/null +++ b/paddle/operators/scale_op.cu @@ -0,0 +1,18 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/scale_op.h" + +REGISTER_OP_GPU_KERNEL( + scale, paddle::operators::ScaleKernel); diff --git a/paddle/operators/scale_op.h b/paddle/operators/scale_op.h new file mode 100644 index 0000000000000000000000000000000000000000..aea64f1b0428ffe79ba8d90cf79dbfd2b5ef36f4 --- /dev/null +++ b/paddle/operators/scale_op.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { +template +class ScaleKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& context) const { + auto* tensor = context.Output("Out"); + auto* in = context.Input("X"); + tensor->mutable_data(in->place()); + + auto scale = static_cast(context.op_.GetAttr("scale")); + + auto eigen_out = framework::EigenVector::Flatten(*tensor); + auto eigen_in = framework::EigenVector::Flatten(*in); + auto& dev = context.GetEigenDevice(); + eigen_out.device(dev) = scale * eigen_in; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp index 0547ac93cd183afbcede41d280c6b4b16ed7dab1..79d2158334269be2c644c74b202724fabc21a07b 100644 --- a/paddle/parameter/Argument.cpp +++ b/paddle/parameter/Argument.cpp @@ -276,17 +276,21 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src, void Argument::concat(const std::vector& args, const std::vector& selectRows, const std::vector& seqStartPos, + const std::vector& copySize, bool useGpu, hl_stream_t stream, PassType passType) { CHECK(!subSequenceStartPositions) << "undefined behavior for subsequence positions"; - size_t batchSize = selectRows.size(); + size_t batchSize = 0; + for (size_t i = 0; i < copySize.size(); ++i) + batchSize += copySize[i] * (seqStartPos[i + 1] - seqStartPos[i]); + auto copyArg = [batchSize, stream](MatrixPtr& dst, MatrixPtr src, - int startRow, - int pos, + int desStartRow, + int srcStartRow, int size, bool useGpu) { if (!src) { @@ -300,14 +304,14 @@ void Argument::concat(const std::vector& args, dst->resize(batchSize, width); } - MatrixPtr tmpMatrix = dst->subMatrix(startRow, size); - tmpMatrix->copyFrom(*src->subMatrix(pos, size), stream); + MatrixPtr tmpMatrix = dst->subMatrix(desStartRow, size); + tmpMatrix->copyFrom(*src->subMatrix(srcStartRow, size), stream); }; auto copyIds = [batchSize, stream](IVectorPtr& dst, const IVectorPtr& src, - int startRow, - int pos, + int desStartRow, + int srcStartRow, int size, bool 
useGpu) { if (!src) { @@ -315,13 +319,14 @@ void Argument::concat(const std::vector& args, return; } IVector::resizeOrCreate(dst, batchSize, useGpu); - dst->subVec(startRow, size)->copyFrom(*src->subVec(pos, size), stream); + dst->subVec(desStartRow, size) + ->copyFrom(*src->subVec(srcStartRow, size), stream); }; auto copyStrs = [batchSize, stream](SVectorPtr& dst, const SVectorPtr& src, - int startRow, - int pos, + int desStartRow, + int srcStartRow, int size, bool useGpu) { if (!src) { @@ -333,30 +338,31 @@ void Argument::concat(const std::vector& args, } else { dst->resize(batchSize); } - std::copy( - src->begin() + pos, src->begin() + pos + size, dst->begin() + startRow); + std::copy(src->begin() + srcStartRow, + src->begin() + srcStartRow + size, + dst->begin() + desStartRow); }; dataId = args[0].dataId; CHECK_NE(seqStartPos.size(), 0UL); - size_t sampleNum = seqStartPos.size() - 1; - for (size_t i = 0; i < sampleNum; ++i) { + int desStartRow = 0; + for (size_t i = 0; i < copySize.size(); ++i) { int startPos = seqStartPos[i]; int endPos = seqStartPos[i + 1]; CHECK_GE(args.size(), static_cast(endPos - startPos)); for (int j = startPos; j < endPos; ++j) { const Argument& arg = args[j - startPos]; - CHECK_EQ(arg.dataId, dataId) << "Arguments in concat should have" - << " same dataId"; - const int copySize = 1; - const int rowIdx = selectRows[j]; - copyArg(in, arg.in, j, rowIdx, copySize, useGpu); - copyArg(value, arg.value, j, rowIdx, copySize, useGpu); + CHECK_EQ(arg.dataId, dataId) << "Arguments to concatenate should have " + << "the same dataId."; + const int srcStartRow = selectRows[j]; + copyArg(in, arg.in, desStartRow, srcStartRow, copySize[i], useGpu); + copyArg(value, arg.value, desStartRow, srcStartRow, copySize[i], useGpu); if (passType != PASS_TEST) { - copyArg(grad, arg.grad, j, rowIdx, copySize, useGpu); + copyArg(grad, arg.grad, desStartRow, srcStartRow, copySize[i], useGpu); } - copyIds(ids, arg.ids, j, rowIdx, copySize, useGpu); - copyStrs(strs, arg.strs, j, rowIdx, copySize, useGpu); + copyIds(ids, arg.ids, desStartRow, srcStartRow, copySize[i], useGpu); + copyStrs(strs, arg.strs, desStartRow, srcStartRow, copySize[i], useGpu); + desStartRow += copySize[i]; } } ICpuGpuVector::resizeOrCreate( diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h index d8d7a4398f99a2794c5d25528a7d582f5ed629ba..38797a76f55c311070192bd307103143d67cabca 100644 --- a/paddle/parameter/Argument.h +++ b/paddle/parameter/Argument.h @@ -240,6 +240,7 @@ struct Argument { void concat(const std::vector& args, const std::vector& selectRows, const std::vector& seqStartPos, + const std::vector& copySize, bool useGpu, hl_stream_t stream, PassType passType); diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt index 8a2b7c54d3ef481712bef1e1a39fb336b23eb1b2..3f4110e4a9de796140af9703559937338d27f251 100644 --- a/python/paddle/v2/framework/tests/CMakeLists.txt +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -28,3 +28,4 @@ py_test(test_uniform_random_op SRCS test_uniform_random_op.py) py_test(test_recurrent_op SRCS test_recurrent_op.py) py_test(test_sgd_op SRCS test_sgd_op.py) py_test(test_gradient_checker SRCS test_gradient_checker.py) +py_test(test_scale_and_identity_op SRCS test_scale_and_identity_op.py) diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py index 8b8e2f444be1169c23784321721c5d8154541fcf..c22c6f8831b2551d9a83747bc0d15789a78a101e 100644 --- 
a/python/paddle/v2/framework/tests/gradient_checker.py +++ b/python/paddle/v2/framework/tests/gradient_checker.py @@ -160,8 +160,13 @@ class GradientChecker(unittest.TestCase): grad_tensor.set(data, place) # run backward op - for name in backward_op.outputs(): + backward_outs = backward_op.outputs() + backward_names = [ + item for key in backward_outs for item in backward_outs[key] + ] + for name in backward_names: scope.new_var(name) + backward_op.infer_shape(scope) backward_op.run(scope, ctx) diff --git a/python/paddle/v2/framework/tests/test_scale_and_identity_op.py b/python/paddle/v2/framework/tests/test_scale_and_identity_op.py new file mode 100644 index 0000000000000000000000000000000000000000..69b301c376ee7a4ebb2e2dadc645c7d10f823a08 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_scale_and_identity_op.py @@ -0,0 +1,43 @@ +import unittest +from op_test_util import OpTestMeta +from gradient_checker import GradientChecker, create_op +import numpy as np +from paddle.v2.framework.op import Operator + + +class IdentityTest(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "identity" + self.inputs = {'X': np.random.random((32, 784)).astype("float32")} + self.outputs = {'Out': self.inputs['X']} + + +class IdentityGradOpTest(GradientChecker): + def test_normal(self): + op = create_op("identity") + inputs = {"X": np.random.random((10, 10)).astype("float32")} + self.check_grad(op, inputs, set("X"), "Out") + + +class ScaleTest(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "scale" + self.inputs = {'X': np.random.random((32, 784)).astype("float32")} + self.attrs = {'scale': -2.3} + self.outputs = {'Out': self.inputs['X'] * self.attrs['scale']} + + +class ScaleGradTest(GradientChecker): + def test_normal(self): + op = Operator("scale", X="X", Out="Out", scale=3.2) + self.check_grad(op, + {"X": np.random.random((10, 10)).astype("float32")}, + set("X"), "Out") + + +if __name__ == '__main__': + unittest.main()
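
# --- Illustrative note (not part of the patch) ---
# A minimal NumPy sketch of the semantics the new scale/identity operators
# implement, including the gradient relation Grad(X) = scale * Grad(Out)
# stated in the ScaleGradOp comment. The function names below are
# hypothetical and do not use the Paddle API.
import numpy as np


def scale_forward(x, scale=1.0):
    # Out = scale * X, the equation documented in ScaleOpMaker.
    return scale * x


def scale_backward(grad_out, scale=1.0):
    # Grad(Out = scale(X)) => Grad(X) = scale(Grad(Out)):
    # the gradient of a scale op is another scale op with the same factor.
    return scale * grad_out


def identity_forward(x):
    # identity is an alias of scale with scale = 1.0.
    return scale_forward(x, scale=1.0)


if __name__ == '__main__':
    x = np.random.random((32, 784)).astype("float32")
    out = scale_forward(x, scale=-2.3)           # mirrors ScaleTest above
    assert np.allclose(out, x * -2.3)
    grad_x = scale_backward(np.ones_like(out), scale=-2.3)
    assert np.allclose(grad_x, -2.3 * np.ones_like(out))
    assert np.allclose(identity_forward(x), x)   # mirrors IdentityTest above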