Merge branch 'develop' into huber_loss

75961918 · Luo Tao · 73ab2d46 · b9a4b2ee · 75961918 · 75961918
105 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,6 +55,7 @@ option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
 option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
+option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -28,6 +28,10 @@ if(NOT WITH_TIMER)
    add_definitions(-DPADDLE_DISABLE_TIMER)
 endif(NOT WITH_TIMER)
+if(USE_EIGEN_FOR_BLAS)
+    add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
+endif(USE_EIGEN_FOR_BLAS)
 if(NOT WITH_PROFILER)
    add_definitions(-DPADDLE_DISABLE_PROFILER)
 endif(NOT WITH_PROFILER)

--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -2,7 +2,7 @@ if(NOT WITH_GPU)
    return()
 endif()
-set(CUDNN_ROOT "" CACHE PATH "CUDNN ROOT")
+set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT")
 find_path(CUDNN_INCLUDE_DIR cudnn.h
    PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include
    $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE}

--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -257,6 +257,11 @@ seq_concat
 ..  autoclass:: paddle.v2.layer.seq_concat
    :noindex:
+seq_slice
+---------
+..  autoclass:: paddle.v2.layer.seq_slice
+    :noindex:
 kmax_sequence_score
 -------------------
 ..  autoclass:: paddle.v2.layer.kmax_sequence_score
@@ -362,6 +367,11 @@ trans
 ..  autoclass:: paddle.v2.layer.trans
    :noindex:
+scale_shift
+-----------
+..  autoclass:: paddle.v2.layer.scale_shift
+    :noindex:
 Sampling Layers
 ===============

--- a/go/master/client.go
+++ b/go/master/client.go
@@ -63,13 +63,24 @@ func WithAddr(addr string) func(c *Client) error {
 // WithEtcd sets the client to use etcd for master discovery.
 func WithEtcd(endpoints []string, timeout time.Duration) func(*Client) error {
 	return func(c *Client) error {
-		cli, err := clientv3.New(clientv3.Config{
+		var cli *clientv3.Client
-			Endpoints:   endpoints,
+		f := func() error {
-			DialTimeout: timeout,
+			var err error
-		})
+			cli, err = clientv3.New(clientv3.Config{
-		if err != nil {
+				Endpoints:   endpoints,
+				DialTimeout: timeout,
+			})
 			return err
 		}
+		for {
+			err := f()
+			if err != nil {
+				log.Warningln(err)
+			} else {
+				break
+			}
+			time.Sleep(time.Second)
+		}
 		ch := make(chan string, 1)
 		a, err := GetKey(cli, DefaultAddrPath, timeout)
@@ -101,9 +112,6 @@ func NewClient(opts ...func(*Client) error) (*Client, error) {
 		}
 	}
 	c.ch = make(chan record, c.bufSize)
-	// FIXME: connection is created asyncrosly in monitorMaster go routine,
-	//        ensure the connection is ready for use before calling c.addClient.
-	time.Sleep(time.Second)
 	return c, nil
 }

--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -15,6 +15,7 @@ if(Boost_FOUND)
  add_subdirectory(platform)
  add_subdirectory(framework)
  add_subdirectory(operators)
+  add_subdirectory(pybind)
 endif()
 if(WITH_C_API)

--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -18,8 +18,8 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
 proto_library(framework_proto SRCS framework.proto)
 cc_library(attribute SRCS attribute.cc DEPS framework_proto)
+cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
-cc_library(operator SRCS operator.cc DEPS framework_proto device_context tensor scope attribute)
+cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
 cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator)
@@ -39,21 +39,3 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
-if(WITH_PYTHON)
-cc_library(paddle_pybind SHARED
-    SRCS pybind.cc
-    DEPS pybind python backward
-    sgd_op
-    add_op
-    mul_op
-    rowwise_add_op
-    sigmoid_op
-    softmax_op
-    mean_op
-    cross_entropy_op
-    recurrent_op
-    uniform_random_op
-    gaussian_random_op
-    fill_zeros_like_op)
-endif(WITH_PYTHON)
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -110,7 +110,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
                       dup_output_ops[out].emplace_back(local_op_id);
                       return false;
                     });
-      net->AddOp(std::move(bwd));
+      net->AppendOp(std::move(bwd));
    }
    // Get unique ID for this method.
    auto uid = uniq_id++;
@@ -163,8 +163,9 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
        // If part of input gradient of that operator is not calculated, fill
        // zero variables to that input gradient.
-        net->AddOp(OpRegistry::CreateOp("fill_zeros_like", {{"Src", {prefix}}},
+        net->AppendOp(OpRegistry::CreateOp("fill_zeros_like",
-                                        {{"Dst", {grad_input}}}, {}));
+                                           {{"Src", {prefix}}},
+                                           {{"Dst", {grad_input}}}, {}));
      }
      return false;
    });
@@ -195,7 +196,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
    if (net->ops_.empty()) {  // Current no aux op is added to network
      return grad_op;
    }
-    net->AddOp(std::move(grad_op));
+    net->AppendOp(std::move(grad_op));
  }
  net->SetType("@GENERATED_BACKWARD@");
  net->CompleteAddOp();

--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -72,16 +72,16 @@ class NoGradOpMaker : public OpProtoAndCheckerMaker {
 class FcOp : public operators::NetOp {
 public:
-  FcOp(const std::string &type, const VarNameMap &inputs,
+  FcOp(const std::string &type, const VariableNameMap &inputs,
-       const VarNameMap &outputs, const AttributeMap &attrs)
+       const VariableNameMap &outputs, const AttributeMap &attrs)
      : NetOp(type, inputs, outputs, attrs) {
-    AddOp(OpRegistry::CreateOp("mul",
+    AppendOp(OpRegistry::CreateOp("mul",
-                               {{"X", {Input("X")}}, {"Y", {Input("W")}}},
+                                  {{"X", {Input("X")}}, {"Y", {Input("W")}}},
-                               {{"Out", {Output("mul_result")}}}, {}));
+                                  {{"Out", {Output("mul_result")}}}, {}));
    auto input_b = Inputs("b");
    std::string before_act = "mul_result";
    if (input_b.size() != 0) {
-      AddOp(OpRegistry::CreateOp(
+      AppendOp(OpRegistry::CreateOp(
          "rowwise_add", {{"X", {Output("mul_result")}}, {"b", {input_b[0]}}},
          {{"Out", {Output("add_result")}}}, {}));
      before_act = "add_result";
@@ -92,8 +92,8 @@ class FcOp : public operators::NetOp {
      }
    }
-    AddOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}},
+    AppendOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}},
-                               {{"Out", {Output("Out")}}}, {}));
+                                  {{"Out", {Output("Out")}}}, {}));
    CompleteAddOp(false);
  }
 };
@@ -234,13 +234,13 @@ TEST(Backward, net_fc_backward_not_have_b) {
 TEST(Backward, net_input_of_network_not_need_grad) {
  ops::NetOp net;
-  net.AddOp(f::OpRegistry::CreateOp(
+  net.AppendOp(f::OpRegistry::CreateOp(
      "fc", {{"X", {"x"}}, {"W", {"W1"}}, {"b", {"b1"}}},
      {{"mul_result", {"mul_tmp_0"}},
       {"add_result", {"add_tmp_0"}},
       {"Out", {"hidden0"}}},
      {}));
-  net.AddOp(f::OpRegistry::CreateOp(
+  net.AppendOp(f::OpRegistry::CreateOp(
      "fc", {{"X", {"hidden0"}}, {"W", {"W2"}}, {"b", {"b2"}}},
      {{"mul_result", {"mul_tmp_1"}},
       {"add_result", {"add_tmp_1"}},
@@ -273,10 +273,10 @@ TEST(Backward, net_input_of_network_not_need_grad) {
 TEST(Backward, net_shared_weight) {
  ops::NetOp net;
-  net.AddOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}},
+  net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}},
-                                    {{"Out", {"out"}}}, {}));
+                                       {{"Out", {"out"}}}, {}));
-  net.AddOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}},
+  net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}},
-                                    {{"Out", {"FinalOut"}}}, {}));
+                                       {{"Out", {"FinalOut"}}}, {}));
  net.CompleteAddOp();
  auto bwd = f::Backward(net, {});
@@ -357,19 +357,19 @@ TEST(Backward, op_part_of_input_are_not_need) {
 TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
  ops::NetOp net;
-  net.AddOp(f::OpRegistry::CreateOp(
+  net.AppendOp(f::OpRegistry::CreateOp(
      "fc", {{"X", {"x1"}}, {"W", {"w1"}}, {"b", {"b1"}}},
      {{"mul_result", {"mul_out1"}},
       {"add_result", {"add_out1"}},
       {"Out", {"out1"}}},
      {}));
-  net.AddOp(f::OpRegistry::CreateOp(
+  net.AppendOp(f::OpRegistry::CreateOp(
      "fc", {{"X", {"out1"}}, {"W", {"w2"}}, {"b", {"b2"}}},
      {{"mul_result", {"mul_out2"}},
       {"add_result", {"tmp_out2"}},
       {"Out", {"out2"}}},
      {}));
-  net.AddOp(f::OpRegistry::CreateOp(
+  net.AppendOp(f::OpRegistry::CreateOp(
      "fc", {{"X", {"out2"}}, {"W", {"w3"}}, {"b", {"b3"}}},
      {{"mul_result", {"mul_out3"}},
       {"add_result", {"tmp_out3"}},

--- a/paddle/framework/grad_op_builder.cc
+++ b/paddle/framework/grad_op_builder.cc
@@ -20,13 +20,13 @@ namespace framework {
 enum class OpArgType { IN, OUT };
 static void TransOpArg(const OperatorBase* src_op, const OpArgType& src_type,
-                       bool is_grad, OperatorBase::VarNameMap* vars) {
+                       bool is_grad, VariableNameMap* vars) {
  const auto& src_inout =
      src_type == OpArgType::IN ? src_op->Inputs() : src_op->Outputs();
  auto& dst_inout = *vars;
-  const OpProto* proto = OpRegistry::op_info_map().at(src_op->Type()).proto_;
+  auto& proto = OpInfoMap::Instance().Get(src_op->Type()).Proto();
  const auto& src_arg_list =
-      src_type == OpArgType::IN ? proto->inputs() : proto->outputs();
+      src_type == OpArgType::IN ? proto.inputs() : proto.outputs();
  for (const auto& arg : src_arg_list) {
    if (arg.not_in_gradient() && !is_grad) continue;
    const std::string src_name = arg.name();
@@ -40,26 +40,18 @@ static void TransOpArg(const OperatorBase* src_op, const OpArgType& src_type,
 }
 OperatorBase* BuildGradOp(const OperatorBase* op) {
-  auto it = OpRegistry::op_info_map().find(op->Type());
+  auto& info = OpInfoMap::Instance().Get(op->Type());
-  PADDLE_ENFORCE(it != OpRegistry::op_info_map().end(),
+  PADDLE_ENFORCE(info.HasGradientOp());
-                 "'%s' has not been registered.", op->Type());
-  PADDLE_ENFORCE(it->second.proto_ != nullptr, "'%s' has no OpProto.",
-                 op->Type());
-  std::string grad_op_type = it->second.grad_op_type_;
-  PADDLE_ENFORCE(!grad_op_type.empty(), "'%s' has no gradient operator.",
-                 op->Type());
-  OperatorBase::VarNameMap inputs;
+  VariableNameMap inputs;
-  OperatorBase::VarNameMap outputs;
+  VariableNameMap outputs;
  TransOpArg(op, OpArgType::IN, false, &inputs);   // I
  TransOpArg(op, OpArgType::OUT, false, &inputs);  // O
  TransOpArg(op, OpArgType::OUT, true, &inputs);   // OG
  TransOpArg(op, OpArgType::IN, true, &outputs);   // IG
-  it = OpRegistry::op_info_map().find(grad_op_type);
+  auto& grad_info = OpInfoMap::Instance().Get(info.grad_op_type_);
-  PADDLE_ENFORCE(it != OpRegistry::op_info_map().end(),
+  return grad_info.Creator()(info.grad_op_type_, inputs, outputs, op->Attrs());
-                 "'%s' has not been registered.", grad_op_type);
-  return it->second.creator_(grad_op_type, inputs, outputs, op->Attrs());
 }
 }  // namespace framework

--- a/paddle/framework/op_info.cc
+++ b/paddle/framework/op_info.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/framework/op_info.h"
+namespace paddle {
+namespace framework {
+static OpInfoMap* g_op_info_map = nullptr;
+OpInfoMap& OpInfoMap::Instance() {
+  if (g_op_info_map == nullptr) {
+    g_op_info_map = new OpInfoMap();
+  }
+  return *g_op_info_map;
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/op_info.h
+++ b/paddle/framework/op_info.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <functional>
+#include <map>
+#include <string>
+#include <unordered_map>
+#include "paddle/framework/attribute.h"
+namespace paddle {
+namespace framework {
+class OperatorBase;
+using VariableNameMap = std::map<std::string, std::vector<std::string>>;
+using OpCreator = std::function<OperatorBase*(
+    const std::string& /*type*/, const VariableNameMap& /*inputs*/,
+    const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
+struct OpInfo {
+  OpCreator creator_;
+  std::string grad_op_type_;
+  OpProto* proto_;
+  OpAttrChecker* checker_;
+  bool HasOpProtoAndChecker() const {
+    return proto_ != nullptr && checker_ != nullptr;
+  }
+  const OpProto& Proto() const {
+    PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered");
+    PADDLE_ENFORCE(proto_->IsInitialized(),
+                   "Operator Proto must be initialized in op info");
+    return *proto_;
+  }
+  const OpAttrChecker& Checker() const {
+    PADDLE_ENFORCE_NOT_NULL(checker_,
+                            "Operator Checker has not been registered");
+    return *checker_;
+  }
+  const OpCreator& Creator() const {
+    PADDLE_ENFORCE_NOT_NULL(creator_,
+                            "Operator Creator has not been registered");
+    return creator_;
+  }
+  bool HasGradientOp() const { return !grad_op_type_.empty(); }
+};
+class OpInfoMap {
+ public:
+  static OpInfoMap& Instance();
+  OpInfoMap(const OpInfoMap& o) = delete;
+  OpInfoMap(OpInfoMap&& o) = delete;
+  OpInfoMap& operator=(const OpInfoMap& o) = delete;
+  OpInfoMap& operator=(OpInfoMap&& o) = delete;
+  bool Has(const std::string& op_type) const {
+    return map_.find(op_type) != map_.end();
+  }
+  void Insert(const std::string& type, const OpInfo& info) {
+    PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type);
+    map_.insert({type, info});
+  }
+  const OpInfo& Get(const std::string& type) const {
+    auto it = map_.find(type);
+    PADDLE_ENFORCE(it != map_.end(), "Operator %s are not found", type);
+    return it->second;
+  }
+  template <typename Callback>
+  void IterAllInfo(Callback callback) {
+    for (auto& it : map_) {
+      callback(it.first, it.second);
+    }
+  }
+ private:
+  OpInfoMap() = default;
+  std::unordered_map<std::string, const OpInfo> map_;
+};
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/op_registry.cc
+++ b/paddle/framework/op_registry.cc
@@ -19,32 +19,18 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const std::string& type,
+std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
-                                                   const VarNameMap& inputs,
+    const std::string& type, const VariableNameMap& inputs,
-                                                   const VarNameMap& outputs,
+    const VariableNameMap& outputs, AttributeMap attrs) {
-                                                   AttributeMap attrs) {
+  auto& info = OpInfoMap::Instance().Get(type);
-  auto it = op_info_map().find(type);
+  info.Checker().Check(attrs);
-  PADDLE_ENFORCE(it != op_info_map().end(),
+  auto op = info.Creator()(type, inputs, outputs, attrs);
-                 "Operator '%s' has not been registered.", type);
-  it->second.checker_->Check(attrs);
-  auto op = it->second.creator_(type, inputs, outputs, attrs);
  return std::unique_ptr<OperatorBase>(op);
 }
-std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
+static VariableNameMap ConvertOpDescVarsToVarNameMap(
-  VarNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs());
-  VarNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs());
-  AttributeMap attrs;
-  for (auto& attr : op_desc.attrs()) {
-    attrs[attr.name()] = GetAttrValue(attr);
-  }
-  return CreateOp(op_desc.type(), inputs, outputs, attrs);
-}
-OperatorBase::VarNameMap OpRegistry::ConvertOpDescVarsToVarNameMap(
    const google::protobuf::RepeatedPtrField<OpDesc::Var>& op_desc_vars) {
-  VarNameMap ret_val;
+  VariableNameMap ret_val;
  for (auto& var : op_desc_vars) {
    auto& var_names = ret_val[var.parameter()];
    auto& var_names_in_proto = var.arguments();
@@ -55,6 +41,17 @@ OperatorBase::VarNameMap OpRegistry::ConvertOpDescVarsToVarNameMap(
  return ret_val;
 }
+std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
+  VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs());
+  VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs());
+  AttributeMap attrs;
+  for (auto& attr : op_desc.attrs()) {
+    attrs[attr.name()] = GetAttrValue(attr);
+  }
+  return CreateOp(op_desc.type(), inputs, outputs, attrs);
+}
 std::unique_ptr<OperatorBase> OpRegistry::CreateGradOp(const OperatorBase& op) {
  PADDLE_ENFORCE(!op.IsNetOp(), "Use framework::Backward to get backward ops");
  return std::unique_ptr<OperatorBase>(BuildGradOp(&op));

--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/framework/attribute.h"
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/grad_op_builder.h"
+#include "paddle/framework/op_info.h"
 #include "paddle/framework/operator.h"
 #include "paddle/framework/scope.h"
@@ -30,28 +31,16 @@ namespace paddle {
 namespace framework {
 class OpRegistry {
-  using VarNameMap = OperatorBase::VarNameMap;
-  using OpCreator = std::function<OperatorBase*(
-      const std::string& /*type*/, const VarNameMap& /*inputs*/,
-      const VarNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
 public:
-  struct OpInfo {
-    OpCreator creator_;
-    std::string grad_op_type_;
-    OpProto* proto_;
-    OpAttrChecker* checker_;
-  };
  template <typename OpType, typename ProtoMakerType, typename GradOpType>
  static void RegisterOp(const std::string& op_type,
                         const std::string& grad_op_type) {
-    PADDLE_ENFORCE(op_info_map().count(op_type) == 0,
+    PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type),
                   "'%s' is registered more than once.", op_type);
    OpInfo op_info;
-    op_info.creator_ = [](const std::string& type, const VarNameMap& inputs,
+    op_info.creator_ = [](
-                          const VarNameMap& outputs,
+        const std::string& type, const VariableNameMap& inputs,
-                          const AttributeMap& attrs) {
+        const VariableNameMap& outputs, const AttributeMap& attrs) {
      return new OpType(type, inputs, outputs, attrs);
    };
    op_info.grad_op_type_ = grad_op_type;
@@ -70,7 +59,7 @@ class OpRegistry {
      op_info.proto_ = nullptr;
      op_info.checker_ = nullptr;
    }
-    op_info_map().insert(std::make_pair(op_type, op_info));
+    OpInfoMap::Instance().Insert(op_type, op_info);
    // register gradient op
    if (!grad_op_type.empty()) {
      RegisterOp<GradOpType, NOPMaker, NOP>(grad_op_type, "");
@@ -78,21 +67,13 @@ class OpRegistry {
  }
  static std::unique_ptr<OperatorBase> CreateOp(const std::string& type,
-                                                const VarNameMap& inputs,
+                                                const VariableNameMap& inputs,
-                                                const VarNameMap& outputs,
+                                                const VariableNameMap& outputs,
                                                AttributeMap attrs);
  static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc);
-  static VarNameMap ConvertOpDescVarsToVarNameMap(
-      const google::protobuf::RepeatedPtrField<OpDesc::Var>& op_desc_vars);
  static std::unique_ptr<OperatorBase> CreateGradOp(const OperatorBase& op);
-  static std::unordered_map<std::string, const OpInfo>& op_info_map() {
-    static std::unordered_map<std::string, const OpInfo> op_info_map_;
-    return op_info_map_;
-  }
 };
 class Registrar {

--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -115,8 +115,8 @@ void OperatorBase::Rename(const std::string& old_name,
 }
 OperatorBase::OperatorBase(const std::string& type,
-                           const OperatorBase::VarNameMap& inputs,
+                           const VariableNameMap& inputs,
-                           const OperatorBase::VarNameMap& outputs,
+                           const VariableNameMap& outputs,
                           const AttributeMap& attrs)
    : type_(type), inputs_(inputs), outputs_(outputs), attrs_(attrs) {
  static std::atomic<size_t> gUniqId(0UL);
@@ -141,18 +141,10 @@ std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
    }
    return ret_val;
  }
-  auto it = OpRegistry::op_info_map().find(type_);
+  auto& info = OpInfoMap::Instance().Get(Type());
-  PADDLE_ENFORCE(
-      it != OpRegistry::op_info_map().end(),
-      "Operator %s not registered, cannot figure out intermediate outputs",
-      type_);
-  PADDLE_ENFORCE(
-      it->second.proto_ != nullptr,
-      "Operator %s has no OpProto, cannot figure out intermediate outputs",
-      type_);
  // get all OpProto::Var for outputs
-  for (auto& o : it->second.proto_->outputs()) {
+  for (auto& o : info.Proto().outputs()) {
    // ignore all intermediate output
    if (o.intermediate()) continue;
    auto out = outputs_.find(o.name());

--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
+#include "op_info.h"
 #include "paddle/framework/attribute.h"
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/scope.h"
@@ -62,10 +63,8 @@ class ExecutionContext;
 */
 class OperatorBase {
 public:
-  using VarNameMap = std::map<std::string, std::vector<std::string>>;
+  OperatorBase(const std::string& type, const VariableNameMap& inputs,
+               const VariableNameMap& outputs, const AttributeMap& attrs);
-  OperatorBase(const std::string& type, const VarNameMap& inputs,
-               const VarNameMap& outputs, const AttributeMap& attrs);
  virtual ~OperatorBase() {}
@@ -93,8 +92,8 @@ class OperatorBase {
  /// rename inputs outputs name
  void Rename(const std::string& old_name, const std::string& new_name);
-  const VarNameMap& Inputs() const { return inputs_; }
+  const VariableNameMap& Inputs() const { return inputs_; }
-  const VarNameMap& Outputs() const { return outputs_; }
+  const VariableNameMap& Outputs() const { return outputs_; }
  //! Get a input with argument's name described in `op_proto`
  const std::string& Input(const std::string& name) const;
  //! Get a input which has multiple variables.
@@ -122,30 +121,32 @@ class OperatorBase {
  // I (Inputs)opear
  // O (Outputs)
  // OG (Output Gradients)
-  VarNameMap inputs_;
+  VariableNameMap inputs_;
  // NOTE: in case of OpGrad, outputs_ contains
  // IG (Inputs Gradients)
-  VarNameMap outputs_;
+  VariableNameMap outputs_;
  AttributeMap attrs_;
 };
 // Macro for define a clone method.
 // If you are writing an kernel operator, `Clone` will be defined when you
 // register it. i.e. `Clone` method is not needed to define by yourself.
-#define DEFINE_OP_CLONE_METHOD(CLS)                       \
+#define DEFINE_OP_CLONE_METHOD(cls)                       \
  std::unique_ptr<OperatorBase> Clone() const final {     \
-    return std::unique_ptr<OperatorBase>(new CLS(*this)); \
+    return std::unique_ptr<OperatorBase>(new cls(*this)); \
  }
 // Macro for define a default constructor for Operator.
 // You can also use
 //   using PARENT_CLASS::PARENT_CLASS;
 // to use parent's constructor.
-#define DEFINE_OP_CONSTRUCTOR(CLS, PARENT_CLS)                                 \
+#define DEFINE_OP_CONSTRUCTOR(cls, parent_cls)             \
-  CLS(const std::string& type, const VarNameMap& inputs,                       \
+  cls(const std::string& type,                             \
-      const VarNameMap& outputs, const paddle::framework::AttributeMap& attrs) \
+      const ::paddle::framework::VariableNameMap& inputs,  \
-      : PARENT_CLS(type, inputs, outputs, attrs) {}
+      const ::paddle::framework::VariableNameMap& outputs, \
+      const paddle::framework::AttributeMap& attrs)        \
+      : parent_cls(type, inputs, outputs, attrs) {}
 class NOP : public OperatorBase {
 public:
@@ -389,8 +390,8 @@ class OperatorWithKernel : public OperatorBase {
  using OpKernelMap =
      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;
-  OperatorWithKernel(const std::string& type, const VarNameMap& inputs,
+  OperatorWithKernel(const std::string& type, const VariableNameMap& inputs,
-                     const VarNameMap& outputs, const AttributeMap& attrs)
+                     const VariableNameMap& outputs, const AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void InferShape(const Scope& scope) const override {

--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -23,8 +23,8 @@ static int op_run_num = 0;
 class OpWithoutKernelTest : public OperatorBase {
 public:
-  OpWithoutKernelTest(const std::string& type, const VarNameMap& inputs,
+  OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs,
-                      const VarNameMap& outputs, const AttributeMap& attrs)
+                      const VariableNameMap& outputs, const AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs), x(1) {}
  void InferShape(const Scope& scope) const override {}
  void Run(const Scope& scope,
@@ -249,8 +249,9 @@ TEST(OpKernel, multi_inputs) {
 class OperatorClone : public paddle::framework::OperatorBase {
 public:
  DEFINE_OP_CLONE_METHOD(OperatorClone);
-  OperatorClone(const std::string& type, const VarNameMap& inputs,
+  OperatorClone(const std::string& type,
-                const VarNameMap& outputs,
+                const paddle::framework::VariableNameMap& inputs,
+                const paddle::framework::VariableNameMap& outputs,
                const paddle::framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void InferShape(const paddle::framework::Scope& scope) const override {}

--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -105,7 +105,10 @@ class Tensor {
  template <typename T>
  inline Tensor Slice(const int& begin_idx, const int& end_idx) const;
-  platform::Place place() const { return holder_->place(); }
+  platform::Place place() const {
+    PADDLE_ENFORCE_NOT_NULL(holder_, "Tensor get place() must contains holder");
+    return holder_->place();
+  }
 private:
  template <typename T>

--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -4,6 +4,10 @@ file(GLOB cpp_files . *Op.cpp)
 list(APPEND h_files Function.h)
 list(APPEND cpp_files Function.cpp)
 list(APPEND cpp_files BufferArg.cpp)
+list(APPEND cpp_files GemmFunctor.cpp)
+if(USE_EIGEN_FOR_BLAS)
+  list(APPEND cpp_files EigenGemm.cpp)
+endif(USE_EIGEN_FOR_BLAS)
 if(WITH_GPU)
    file(GLOB cu_files . *OpGpu.cu)

--- a/paddle/function/DepthwiseConvOp.cpp
+++ b/paddle/function/DepthwiseConvOp.cpp
@@ -14,7 +14,6 @@ limitations under the License. */
 #include "DepthwiseConvOp.h"
 #include "ConvOp.h"
-#include "GemmFunctor.h"
 namespace paddle {

--- a/paddle/function/DepthwiseConvOpGpu.cu
+++ b/paddle/function/DepthwiseConvOpGpu.cu
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "DepthwiseConvOp.h"
-#include "GemmFunctor.h"
 #include "paddle/math/BaseMatrix.h"
 namespace paddle {

--- a/paddle/function/EigenGemm.cpp
+++ b/paddle/function/EigenGemm.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <glog/logging.h>
+#include "unsupported/Eigen/CXX11/Tensor"
+namespace paddle {
+template <class T>
+struct EigenBlasGemm {
+  typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, int>,
+                           Eigen::Aligned>
+      Matrix;
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc) {
+    Eigen::array<int, 2> sizeA;
+    if (transA) {
+      sizeA[0] = K;
+      sizeA[1] = M;
+      CHECK_EQ(M, lda);
+    } else {
+      sizeA[0] = M;
+      sizeA[1] = K;
+      CHECK_EQ(K, lda);
+    }
+    Eigen::array<int, 2> sizeB;
+    if (transB) {
+      sizeB[0] = N;
+      sizeB[1] = K;
+      CHECK_EQ(K, ldb);
+    } else {
+      sizeB[0] = K;
+      sizeB[1] = N;
+      CHECK_EQ(N, ldb);
+    }
+    Eigen::array<int, 2> sizeC;
+    sizeC[0] = M;
+    sizeC[1] = N;
+    CHECK_EQ(N, ldc);
+    const Matrix a(const_cast<T*>(A), sizeA);
+    const Matrix b(const_cast<T*>(B), sizeB);
+    Matrix c(C, sizeC);
+    typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
+    Eigen::array<DimPair, 1> dims;
+    dims[0] = DimPair(1, 0);
+    dims[0].first = transA ? 0 : 1;
+    dims[0].second = transB ? 1 : 0;
+    Eigen::DefaultDevice device;
+    if (alpha == T(1) && beta == T(0)) {
+      c.device(device) = a.contract(b, dims);
+    } else if (alpha == T(1) && beta == T(1)) {
+      c.device(device) += a.contract(b, dims);
+    } else {
+      c.device(device) = alpha * a.contract(b, dims) + beta * c;
+    }
+  }
+};
+#ifdef PADDLE_TYPE_DOUBLE
+template class EigenBlasGemm<double>;
+#else
+template class EigenBlasGemm<float>;
+#endif
+}  // namespace paddle
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -85,7 +85,6 @@ public:
    }
    Im2ColFunctor<kCFO, Device, real> im2col;
-    GemmFunctor<Device, real> gemm;
    size_t inputOffset = imShape.getElements();
    size_t outputOffset =
        (outputChannels / groups_) * outputHeight * outputWidth;
@@ -108,19 +107,19 @@ public:
        int M = outputChannels / groups_;
        int N = outputHeight * outputWidth;
        int K = inputChannels / groups_ * filterHeight * filterWidth;
-        gemm(CblasNoTrans,
+        BlasGemm<Device, real>::compute(false,
-             CblasNoTrans,
+                                        false,
-             M,
+                                        M,
-             N,
+                                        N,
-             K,
+                                        K,
-             1.0f,
+                                        1.0f,
-             filterData + g * filterOffset,
+                                        filterData + g * filterOffset,
-             K,
+                                        K,
-             colData,
+                                        colData,
-             N,
+                                        N,
-             beta,
+                                        beta,
-             outputData + g * outputOffset,
+                                        outputData + g * outputOffset,
-             N);
+                                        N);
      }
      inputData += inputChannels * inputHeight * inputWidth;
      outputData += outputChannels * outputHeight * outputWidth;
@@ -188,8 +187,6 @@ public:
    }
    Col2ImFunctor<kCFO, Device, real> col2im;
-    GemmFunctor<Device, real> gemm;
    size_t inputOffset = imShape.getElements();
    size_t outputOffset =
        (outputChannels / groups_) * outputHeight * outputWidth;
@@ -205,19 +202,19 @@ public:
          colData = inputGrad + g * inputOffset;
          scale = 1.0f;
        }
-        gemm(CblasTrans,
+        BlasGemm<Device, real>::compute(true,
-             CblasNoTrans,
+                                        false,
-             M,
+                                        M,
-             N,
+                                        N,
-             K,
+                                        K,
-             1.0f,
+                                        1.0f,
-             filterData + g * filterOffset,
+                                        filterData + g * filterOffset,
-             M,
+                                        M,
-             outputGrad + g * outputOffset,
+                                        outputGrad + g * outputOffset,
-             N,
+                                        N,
-             scale,
+                                        scale,
-             colData,
+                                        colData,
-             N);
+                                        N);
        if (needIm2col) {
          col2im(inputGrad + g * inputOffset,
                 imShape,
@@ -299,7 +296,6 @@ public:
    }
    Im2ColFunctor<kCFO, Device, real> im2col;
-    GemmFunctor<Device, real> gemm;
    size_t inputOffset = imShape.getElements();
    size_t outputOffset =
        (outputChannels / groups_) * outputHeight * outputWidth;
@@ -321,19 +317,19 @@ public:
        int M = outputChannels / groups_;
        int K = outputHeight * outputWidth;
        int N = inputChannels / groups_ * filterHeight * filterWidth;
-        gemm(CblasNoTrans,
+        BlasGemm<Device, real>::compute(false,
-             CblasTrans,
+                                        true,
-             M,
+                                        M,
-             N,
+                                        N,
-             K,
+                                        K,
-             1.0f,
+                                        1.0f,
-             outputGrad + g * outputOffset,
+                                        outputGrad + g * outputOffset,
-             K,
+                                        K,
-             colData,
+                                        colData,
-             K,
+                                        K,
-             i == 0 ? beta : 1.0f,
+                                        i == 0 ? beta : 1.0f,
-             filterGrad + g * filterOffset,
+                                        filterGrad + g * filterOffset,
-             N);
+                                        N);
      }
      inputData += inputChannels * inputHeight * inputWidth;
      outputGrad += outputChannels * outputHeight * outputWidth;

--- a/paddle/function/GemmFunctor.cpp
+++ b/paddle/function/GemmFunctor.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "GemmFunctor.h"
+#include "paddle/math/MathFunctions.h"
+namespace paddle {
+template <class T>
+struct BlasGemm<DEVICE_TYPE_CPU, T> {
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc) {
+#ifdef PADDLE_USE_EIGEN_FOR_BLAS
+    EigenBlasGemm<T>::compute(
+        transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
+#else
+    gemm<T>(transA == false ? CblasNoTrans : CblasTrans,
+            transB == false ? CblasNoTrans : CblasTrans,
+            M,
+            N,
+            K,
+            alpha,
+            A,
+            lda,
+            B,
+            ldb,
+            beta,
+            C,
+            ldc);
+#endif
+  }
+};
+template <class T>
+struct BlasGemm<DEVICE_TYPE_GPU, T> {
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc) {
+    hl_matrix_mul((T*)A,
+                  transA == false ? HPPL_OP_N : HPPL_OP_T,
+                  (T*)B,
+                  transB == false ? HPPL_OP_N : HPPL_OP_T,
+                  C,
+                  M,
+                  N,
+                  K,
+                  alpha,
+                  beta,
+                  lda,
+                  ldb,
+                  ldc);
+  }
+};
+template struct BlasGemm<DEVICE_TYPE_CPU, real>;
+template struct BlasGemm<DEVICE_TYPE_GPU, real>;
+}  // namespace paddle
--- a/paddle/function/GemmFunctor.h
+++ b/paddle/function/GemmFunctor.h
@@ -14,7 +14,7 @@ limitations under the License. */
 #pragma once
-#include "paddle/math/MathFunctions.h"
+#include "TensorType.h"
 namespace paddle {
@@ -24,73 +24,42 @@ namespace paddle {
 // of MatMulFunction, we need to consider the reconstruction of hl_matrix_mul
 // interface.
 template <DeviceType Device, class T>
-class GemmFunctor {
+struct BlasGemm {
-public:
+  static void compute(const bool transA,
-  void operator()(const CBLAS_TRANSPOSE transA,
+                      const bool transB,
-                  const CBLAS_TRANSPOSE TransB,
+                      const int M,
-                  const int M,
+                      const int N,
-                  const int N,
+                      const int K,
-                  const int K,
+                      const T alpha,
-                  const T alpha,
+                      const T* A,
-                  const T* A,
+                      const int lda,
-                  const int lda,
+                      const T* B,
-                  const T* B,
+                      const int ldb,
-                  const int ldb,
+                      const T beta,
-                  const T beta,
+                      T* C,
-                  T* C,
+                      const int ldc);
-                  const int ldc);
 };
+// TODO(hedaoyuan): Since the definition of the real type in the Paddle
+// conflicts with the Eigen library, so compile the Eigen code can not
+// include the Paddle header file. And need an EigenBlasGemm template class
+// that does not contain the DeviceType parameter.
+// I will fix this problem and merge BlasGemm and EigenBlasGemm into one.
 template <class T>
-class GemmFunctor<DEVICE_TYPE_CPU, T> {
+struct EigenBlasGemm {
-public:
+  static void compute(const bool transA,
-  void operator()(const CBLAS_TRANSPOSE transA,
+                      const bool transB,
-                  const CBLAS_TRANSPOSE TransB,
+                      const int M,
-                  const int M,
+                      const int N,
-                  const int N,
+                      const int K,
-                  const int K,
+                      const T alpha,
-                  const T alpha,
+                      const T* A,
-                  const T* A,
+                      const int lda,
-                  const int lda,
+                      const T* B,
-                  const T* B,
+                      const int ldb,
-                  const int ldb,
+                      const T beta,
-                  const T beta,
+                      T* C,
-                  T* C,
+                      const int ldc);
-                  const int ldc) {
-    gemm<T>(transA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
-  }
-};
-template <class T>
-class GemmFunctor<DEVICE_TYPE_GPU, T> {
-public:
-  void operator()(const CBLAS_TRANSPOSE transA,
-                  const CBLAS_TRANSPOSE TransB,
-                  const int M,
-                  const int N,
-                  const int K,
-                  const T alpha,
-                  const T* A,
-                  const int lda,
-                  const T* B,
-                  const int ldb,
-                  const T beta,
-                  T* C,
-                  const int ldc) {
-    hl_matrix_mul((T*)A,
-                  transA == CblasNoTrans ? HPPL_OP_N : HPPL_OP_T,
-                  (T*)B,
-                  TransB == CblasNoTrans ? HPPL_OP_N : HPPL_OP_T,
-                  C,
-                  M,
-                  N,
-                  K,
-                  alpha,
-                  beta,
-                  lda,
-                  ldb,
-                  ldc);
-  }
 };
 }  // namespace paddle
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -202,7 +202,7 @@ void NeuralNetwork::prefetch(const std::vector<Argument>& inArgs) {
        auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>(
            para->getMat(PARAMETER_VALUE).get());
        para->clearGradient();
-        mat->clearIndices();
+        if (mat) mat->clearIndices();
      }
    }
  }

--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -184,7 +184,7 @@ public:
  }
  void backward(const UpdateCallback& callback) override {
-    if (biases_) {
+    if (biases_ && biases_->getWGrad()) {
      backwardActivation();
      biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
      biases_->getParameterPtr()->incUpdate(callback);
@@ -1012,11 +1012,6 @@ void RecurrentGradientMachine::generateSequence() {
                           /* width */ resultNum,
                           false,
                           /* useGpu */ false);
-    Matrix::resizeOrCreate(generator_.outArg.value,
-                           /* height */ maxGenWordCount,
-                           /* width */ 1,
-                           false,
-                           /* useGpu */ false);
  }
  ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions,
                                numSequences + 1,
@@ -1026,7 +1021,7 @@ void RecurrentGradientMachine::generateSequence() {
  } else {
    oneWaySearch(numSequences);
  }
-  if (dataArgsSize_) createDataOutlink(batchMachineIdVec_);
+  if (dataArgsSize_) createDataOutlink();
  size_t size = generator_.ids.size();
  generator_.outArg.ids->resize(size);
@@ -1106,6 +1101,7 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) {
  }
  batchMachineIdVec_.clear();
+  batchMachineStartPos_.clear();
  int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
  starts[0] = 0;
  generator_.ids.clear();
@@ -1312,13 +1308,20 @@ void RecurrentGradientMachine::fillGenOutputs() {
    finalPaths_[i].resize(minFinalPathsSize);
  }
-  batchMachineIdVec_.clear();
  generator_.ids.clear();
  int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
  starts[0] = 0;
  if (numResults > 1) {
-    real* probs = generator_.outArg.in->getData();
+    int idsProbSaveSize = 0;
+    for (auto inSeq : finalPaths_) {
+      for (auto path : inSeq) idsProbSaveSize += path.ids.size();
+      idsProbSaveSize += inSeq.size();
+    }
+    Matrix::resizeOrCreate(
+        generator_.outArg.value, idsProbSaveSize, 1, false, false);
    real* idsProb = generator_.outArg.value->getData();
+    real* probs = generator_.outArg.in->getData();
    size_t curPos = 0;
    for (size_t i = 0; i < finalPaths_.size(); ++i) {
      for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
@@ -1333,24 +1336,16 @@ void RecurrentGradientMachine::fillGenOutputs() {
        curPos += genLen;
        idsProb[curPos++] = -1.0;
        probs[i * numResults + j] = path.logProb;
-        if (!j && dataArgsSize_) {
-          // in beam search, here only reserved the top 1 generated result
-          // for out_links that are not the generated word indices.
-          batchMachineIdVec_.insert(batchMachineIdVec_.end(),
-                                    path.machineIdVec.begin(),
-                                    path.machineIdVec.end());
-        }
      }
      starts[i + 1] = generator_.ids.size();
    }
  } else {
    for (size_t i = 0; i < finalPaths_.size(); ++i) {
      CHECK(!finalPaths_[i].empty());
-      generator_.ids.insert(generator_.ids.begin(),
+      Path& path = finalPaths_[i][0];
-                            finalPaths_[i][0].ids.begin(),
+      generator_.ids.insert(
-                            finalPaths_[i][0].ids.end());
+          generator_.ids.begin(), path.ids.begin(), path.ids.end());
-      starts[i + 1] = starts[i] + finalPaths_[i][0].ids.size();
+      starts[i + 1] = starts[i] + path.ids.size();
    }
  }
 }
@@ -1364,25 +1359,76 @@ void RecurrentGradientMachine::copyDataOutlinkFrame(size_t machineCur) {
  }
 }
-void RecurrentGradientMachine::createDataOutlink(
+void RecurrentGradientMachine::createDataOutlinkSelRowsInfo(
-    std::vector<int>& machineIdVec) {
+    bool isSeq, std::vector<Argument>& outArgs) {
-  size_t seqNum =
+  batchMachineIdVec_.clear();
-      getBeamSize() > 1UL ? finalPaths_.size() : finalPaths_[0].size();
-  std::vector<int> starts(seqNum + 1, 0);
+  size_t seqIdx = 0;
-  for (size_t i = 0; i < seqNum; ++i) {
+  for (size_t i = 0; i < finalPaths_.size(); ++i) {
-    size_t seqLen = getBeamSize() > 1UL ? finalPaths_[i][0].ids.size()
+    for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
-                                        : finalPaths_[0][i].ids.size();
+      std::vector<int>& machineIdVec = finalPaths_[i][j].machineIdVec;
-    starts[i + 1] = starts[i] + seqLen;
+      if (isSeq) {
+        for (size_t i = 0; i < machineIdVec.size(); ++i) {
+          size_t rowId = machineIdVec[i];
+          int* seqPos =
+              outArgs[i].sequenceStartPositions->getMutableData(false);
+          batchMachineIdVec_.push_back(seqPos[rowId]);
+        }
+      } else {
+        batchMachineIdVec_.insert(
+            batchMachineIdVec_.end(), machineIdVec.begin(), machineIdVec.end());
+      }
+      seqIdx++;
+    }
+  }
+}
+void RecurrentGradientMachine::createDataOutlinkCopySizeInfo(
+    bool isSeq, std::vector<Argument>& outArgs, std::vector<int>& copySize) {
+  size_t totalSeqNum = std::accumulate(
+      finalPaths_.begin(),
+      finalPaths_.end(),
+      0UL,
+      [](size_t a, const std::vector<Path>& b) { return a + b.size(); });
+  copySize.resize(totalSeqNum, 1);
+  batchMachineStartPos_.resize(totalSeqNum + 1, 0);
+  if (isSeq) {
+    ICpuGpuVectorPtr inputSeqStartPos = outArgs[0].sequenceStartPositions;
+    CHECK_EQ(static_cast<size_t>(inputSeqStartPos->getSize() - 1),
+             getBeamSize() > 1 ? finalPaths_.size() : finalPaths_[0].size());
+    int* starts = inputSeqStartPos->getMutableData(false);
+    int seqId = 0;
+    for (size_t i = 0; i < finalPaths_.size(); ++i) {
+      for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
+        copySize[seqId] = getBeamSize() > 1 ? starts[i + 1] - starts[i]
+                                            : starts[j + 1] - starts[j];
+        batchMachineStartPos_[seqId + 1] =
+            batchMachineStartPos_[seqId] + finalPaths_[i][j].ids.size();
+        seqId++;
+      }
+    }
+  } else {
+    for (size_t i = 0; i < finalPaths_[0].size(); ++i)
+      batchMachineStartPos_[i + 1] =
+          batchMachineStartPos_[i] + finalPaths_[0][i].ids.size();
  }
+}
+void RecurrentGradientMachine::createDataOutlink() {
  for (size_t i = 0; i < dataArgsSize_; i++) {
+    bool isSeq = dataArgsFrame_[i][0].hasSeq();
+    std::vector<int> copySize;
+    createDataOutlinkCopySizeInfo(isSeq, dataArgsFrame_[i], copySize);
+    createDataOutlinkSelRowsInfo(isSeq, dataArgsFrame_[i]);
    dataArgs_[i].concat(dataArgsFrame_[i],
-                        machineIdVec,
+                        batchMachineIdVec_,
-                        starts,
+                        batchMachineStartPos_,
+                        copySize,
                        useGpu_,
                        HPPL_STREAM_1,
                        PASS_TEST);
    auto dataAgent =
        dynamic_cast<DataLayer*>(outFrameLines_[i + 1].agentLayer.get());
    CHECK_NOTNULL(dataAgent);

--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@@ -190,7 +190,7 @@ public:
    std::vector<int> ids;
    /**
-     * @brief idsProb, log probability of each generated words.
+     * @brief idsProb, log probability of each generated word.
     */
    std::vector<real> idsProb;
@@ -472,15 +472,43 @@ private:
  void copyDataOutlinkFrame(size_t machineCur);
  /*
-   * @brief In generation, if the layer group has more than 1 outlink, outlinks
+   * @brief In generation, if the layer group has more than 1 outlink, outlink
-   * except the first one are data outlinks. This function creates the data
+   * except the first one is a data outlink. In RecurrentLayerGroup, each time
-   * outlinks.
+   * step is a separate Network, outputs of a layer inside the
-   * @note In beam search, only one generated sequence with the hightest log
+   * RecurrentLayerGroup are stored in separate Arguments. If one layer is
-   * probabilites are retained.
+   * specified as an outlink of RecurrentLayerGroup. This function will
-   * @param machineIdVec : select a row of output matrix in each frame
+   * collect outputs in each time step of each generated sequence which are
-   * that the generation process expanded.
+   * dispersed in separate Arguments to form a new single Argument as output of
+   * RecurrentLayerGroup.
   */
-  void createDataOutlink(std::vector<int>& machineIdVec);
+  void createDataOutlink();
+  /*
+   * @brief decide to select how many rows from the Matrix stored the forward
+   * pass results from a start position.
+   *
+   * @param isSeq: a flag indicating whetehr the layer to be output of the
+   * RecurrentGradientMachine is a sequence or not
+   * @param outArgs: all of the the returned Arguments of the forward pass
+   * during the generation process.
+   * @param copySize: the returned result, number of rows to select from the
+   * Matrix stored the forward pass results from a start position.
+   */
+  void createDataOutlinkCopySizeInfo(bool isSeq,
+                                     std::vector<Argument>& outArgs,
+                                     std::vector<int>& copySize);
+  /*
+   * @brief decide index of the start row for each time step of a generated
+   * sequence in Matrix stored the entire beam search batch's forward pass
+   * results.
+   *
+   * @param isSeq: a flag indicating whether the layer to be output of the
+   * RecurrentGradientMachine is a sequence or not
+   * @param outArgs: all of the returned Arguments of the forward pass
+   * during the generation process.
+   */
+  void createDataOutlinkSelRowsInfo(bool isSeq, std::vector<Argument>& outArgs);
  /*
   * @brief used in beam search, connect previous frame to form recurrent link
@@ -543,6 +571,7 @@ private:
  std::vector<int> topIds_;
  std::vector<int> seqIds_;
  std::vector<int> batchMachineIdVec_;
+  std::vector<int> batchMachineStartPos_;
  std::vector<std::vector<Path>> finalPaths_;
  std::vector<real> minFinalPathLogProb_;
  BeamSearchControlCallbacks* beamSearchCtrlCallbacks_;

--- a/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
+++ b/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
@@ -80,13 +80,14 @@ void KmaxSeqScoreLayer::forward(PassType passType) {
      << "input of " << getName()
      << " must be a sequence or a nested sequence.";
  CHECK_EQ(input.value->getWidth(), 1UL)
-      << "input of " << getName()
+      << "input of " << getName() << " are scores over a sequence or "
-      << " is score over a sequence or a nested sequence, so its width "
+      << "a nested sequence, so its width must be 1.";
-      << " must be 1.";
  if (useGpu_) {
-    // this Layer runs only in CPU, if the model is runing on GPU,
+    /*
-    // then copy the input to this layer from GPU to CPU.
+     * currently, this Layer only runs in CPU, if the other part of the model is
+     * runing on GPU, then copy the input to this layer from GPU to CPU.
+     */
    Matrix::resizeOrCreate(scores_,
                           inputScore->getHeight(),
                           1,
@@ -97,6 +98,14 @@ void KmaxSeqScoreLayer::forward(PassType passType) {
    scores_ = inputScore;
  }
+  /*
+   * TODO(caoying)
+   * In PaddePaddle, currently all matrices are real number types,
+   * but output of this layer which is some selected indices of the give
+   * sequence are actually filled with int types so that storing int types
+   * information in a real number matrix is dangerous, since real numbers will
+   * be convered to int types.
+   */
  Matrix::resizeOrCreate(
      output_.value,
      input.hasSubseq() ? input.getNumSubSequences() : input.getNumSequences(),

--- a/paddle/gserver/layers/ScaleShiftLayer.cpp
+++ b/paddle/gserver/layers/ScaleShiftLayer.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "Layer.h"
+namespace paddle {
+/**
+ * A layer applies a linear transformation to each element in each row of
+ * the input matrix. For each element, the layer first re-scale it and then
+ * adds a bias to it.
+ *
+ * \f[
+ *    y = wx + b
+ * \f]
+ *
+ * Here, w is the scale and b is the bias. Both w and b are trainable scalars.
+ *
+ */
+class ScaleShiftLayer : public Layer {
+protected:
+  std::unique_ptr<Weight> scale_;
+  std::unique_ptr<Weight> offset_;
+public:
+  explicit ScaleShiftLayer(const LayerConfig& config) : Layer(config) {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+REGISTER_LAYER(scale_shift, ScaleShiftLayer);
+bool ScaleShiftLayer::init(const LayerMap& layerMap,
+                           const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  CHECK_EQ(inputLayers_.size(), 1U);
+  scale_.reset(new Weight(1, 1, parameters_[0]));
+  if (biasParameter_.get() != NULL) {
+    offset_ = std::unique_ptr<Weight>(new Weight(1, 1, biasParameter_));
+  }
+  return true;
+}
+void ScaleShiftLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  MatrixPtr inV = getInputValue(0);
+  resetOutput(inV->getHeight(), inV->getWidth());
+  MatrixPtr outV = getOutputValue();
+  real scaleValue = scale_->getW()->getElement(0, 0);
+  outV->mulScalar(*inV, scaleValue);
+  if (offset_) {
+    real offsetValue = offset_->getW()->getElement(0, 0);
+    outV->add(offsetValue);
+  }
+}
+void ScaleShiftLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr inG = getInputGrad(0);
+  MatrixPtr outV = getOutputValue();
+  MatrixPtr outG = getOutputGrad();
+  /* Calculate the parameter gradient for the current layer */
+  if (scale_->getWGrad()) {
+    MatrixPtr rowSumMtx;
+    Matrix::resizeOrCreate(rowSumMtx, outG->getHeight(), 1, false, useGpu_);
+    // this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} * c_{ij}
+    rowSumMtx->sumOfProducts(
+        /* b= */ *inV, /* c= */ *outG, /* scaleSum= */ 1, /* scaleDest= */ 0.);
+    // this_i = scaleDest * this_i + scaleSum * \sum_j b_{ji}
+    scale_->getWGrad()->sumCols(
+        /* b= */ *rowSumMtx, /* scaleSum= */ 1., /* scaleDest= */ 1.);
+    scale_->getParameterPtr()->incUpdate(callback);
+  }
+  if (offset_ && offset_->getWGrad()) {
+    MatrixPtr rowSumMtx;
+    Matrix::resizeOrCreate(rowSumMtx, outG->getHeight(), 1, false, useGpu_);
+    rowSumMtx->sumRows(*outG, 1., 0.);
+    offset_->getWGrad()->sumCols(*rowSumMtx, 1., 1.);
+    offset_->getParameterPtr()->incUpdate(callback);
+  }
+  /* Calculate the input layers error */
+  if (inG) {
+    real scaleValue = scale_->getW()->getElement(0, 0);
+    inG->add(*outG, scaleValue);
+  }
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/SequenceSliceLayer.cpp
+++ b/paddle/gserver/layers/SequenceSliceLayer.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+namespace paddle {
+class SequenceSliceLayer : public Layer {
+public:
+  explicit SequenceSliceLayer(const LayerConfig& config) : Layer(config) {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+private:
+  /*
+   * TODO(caoying)
+   * In PaddePaddle, currently all matrices are real number types,
+   * but the second and the (optional) third input which are some
+   * selected indices of the give sequence to trim the sequence, are actually
+   * filled with int types so that storing int types information in real number
+   * matrices is very dangerous, since real numbers will be convered to int
+   * types. If a user fills this matrix himself, invalid data may occor.
+   */
+  MatrixPtr startIdsOnCpu_;
+  MatrixPtr endIdsOnCpu_;
+  std::vector<int> selectedRows_;
+  IVectorPtr rowIndice_;
+  std::vector<std::vector<int>> inputSeqInfoVec_;
+  std::vector<int> outSubSeqStartPos_;
+  std::vector<int> outSeqStartPos_;
+  void checkInputs();
+  void copySliceIdsToCpu();
+  void calSelectedRows(const MatrixPtr starts, const MatrixPtr ends);
+};
+REGISTER_LAYER(seq_slice, SequenceSliceLayer);
+bool SequenceSliceLayer::init(const LayerMap& layerMap,
+                              const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+  CHECK_GE(inputLayers_.size(), 2U);
+  CHECK_LE(inputLayers_.size(), 3U);
+  setNeedSequenceInfo(false);
+  return true;
+}
+void SequenceSliceLayer::checkInputs() {
+  const Argument& inputSeq = getInput(0);
+  CHECK(inputSeq.hasSeq()) << "The first input of sequence slice layer "
+                           << "must be a sequence.";
+  const MatrixPtr indices1 = getInputValue(1);
+  CHECK_EQ(static_cast<size_t>(indices1->getHeight()),
+           inputSeq.hasSubseq() ? inputSeq.getNumSubSequences()
+                                : inputSeq.getNumSequences())
+      << "Height of the second input should be equal to number of sequence "
+      << "in the first input.";
+  if (inputLayers_.size() == 3) {
+    const MatrixPtr indices2 = getInputValue(2);
+    CHECK_EQ(indices2->getHeight(), indices1->getHeight())
+        << "start indices and end indices should have the same height.";
+    CHECK_EQ(indices2->getWidth(), indices1->getWidth())
+        << "start indices and end indices should have the same Width.";
+  }
+}
+void SequenceSliceLayer::copySliceIdsToCpu() {
+  const MatrixPtr indices1 = getInputValue(1);
+  if (inputLayers_.size() == 2U) {
+    if (config_.select_first()) {
+      Matrix::resizeOrCreate(startIdsOnCpu_,
+                             indices1->getHeight(),
+                             indices1->getWidth(),
+                             false /* trans */,
+                             false /* useGpu */);
+      startIdsOnCpu_->copyFrom(*indices1);
+      endIdsOnCpu_ = nullptr;
+    } else {
+      Matrix::resizeOrCreate(endIdsOnCpu_,
+                             indices1->getHeight(),
+                             indices1->getWidth(),
+                             false /* trans */,
+                             false /* useGpu */);
+      endIdsOnCpu_->copyFrom(*indices1);
+      startIdsOnCpu_ = nullptr;
+    }
+  } else if (inputLayers_.size() == 3U) {
+    Matrix::resizeOrCreate(startIdsOnCpu_,
+                           indices1->getHeight(),
+                           indices1->getWidth(),
+                           false /* trans */,
+                           false /* useGpu */);
+    startIdsOnCpu_->copyFrom(*indices1);
+    const MatrixPtr indices2 = getInputValue(2);
+    Matrix::resizeOrCreate(endIdsOnCpu_,
+                           indices2->getHeight(),
+                           indices2->getWidth(),
+                           false /* trans */,
+                           false /* useGpu */);
+    endIdsOnCpu_->copyFrom(*indices2);
+  }
+}
+void SequenceSliceLayer::calSelectedRows(const MatrixPtr starts,
+                                         const MatrixPtr ends) {
+  CHECK(starts || ends) << "At least one of the start or end indices "
+                        << "should be given.";
+  outSeqStartPos_.resize(1, 0);
+  outSubSeqStartPos_.resize(1, 0);
+  selectedRows_.clear();
+  size_t beamSize = starts ? starts->getWidth() : ends->getWidth();
+  size_t rowIdx = 0;
+  for (size_t i = 0; i < inputSeqInfoVec_.size(); ++i) {
+    for (size_t j = 0; j < inputSeqInfoVec_[i].size() - 1; ++j) {
+      for (size_t k = 0; k < beamSize; ++k) {
+        if (starts && starts->getElement(rowIdx, k) == -1.) break;
+        if (ends && ends->getElement(rowIdx, k) == -1.) break;
+        int begPos = inputSeqInfoVec_[i][j];
+        if (starts) begPos += starts->getElement(rowIdx, k);
+        int endPos = inputSeqInfoVec_[i][j + 1] - 1;
+        if (ends) endPos = inputSeqInfoVec_[i][j] + ends->getElement(rowIdx, k);
+        int seqLen = endPos - begPos + 1;
+        CHECK_GT(seqLen, 0U);
+        for (int m = begPos; m <= endPos; ++m) selectedRows_.push_back(m);
+        inputSeqInfoVec_.size() > 1
+            ? outSubSeqStartPos_.push_back(outSubSeqStartPos_.back() + seqLen)
+            : outSeqStartPos_.push_back(outSeqStartPos_.back() + seqLen);
+      }
+      rowIdx++;
+    }
+    if (inputSeqInfoVec_.size() > 1)
+      outSeqStartPos_.push_back(outSubSeqStartPos_.back());
+  }
+  if (useGpu_) {
+    rowIndice_ = IVector::create(selectedRows_.size(), useGpu_);
+    rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size());
+  } else {
+    rowIndice_ =
+        IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_);
+  }
+  // create the sequence information for the output.
+  ICpuGpuVector::resizeOrCreate(
+      output_.sequenceStartPositions, outSeqStartPos_.size(), false);
+  output_.sequenceStartPositions->copyFrom(
+      outSeqStartPos_.data(), outSeqStartPos_.size(), false);
+  if (inputSeqInfoVec_.size() > 1) {
+    ICpuGpuVector::resizeOrCreate(
+        output_.subSequenceStartPositions, outSubSeqStartPos_.size(), false);
+    output_.subSequenceStartPositions->copyFrom(
+        outSubSeqStartPos_.data(), outSubSeqStartPos_.size(), false);
+  }
+}
+void SequenceSliceLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  checkInputs();
+  const Argument& inputSeq = getInput(0);
+  inputSeqInfoVec_.clear();
+  Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
+                              inputSeq.subSequenceStartPositions,
+                              inputSeqInfoVec_);
+  if (!useGpu_) {
+    if (inputLayers_.size() == 2U) {
+      startIdsOnCpu_ = config_.select_first() ? getInputValue(1) : nullptr;
+      endIdsOnCpu_ = config_.select_first() ? nullptr : getInputValue(1);
+    } else if (inputLayers_.size() == 3U) {
+      startIdsOnCpu_ = getInputValue(1);
+      endIdsOnCpu_ = getInputValue(2);
+    }
+  } else {
+    copySliceIdsToCpu();
+  }
+  // calculate the selected row indices in a batch,
+  // and build the output sequence information.
+  calSelectedRows(startIdsOnCpu_ ? startIdsOnCpu_ : nullptr,
+                  endIdsOnCpu_ ? endIdsOnCpu_ : nullptr);
+  resetOutput(selectedRows_.size(), getSize());
+  getOutputValue()->selectRows(*getInputValue(0), *rowIndice_);
+}
+void SequenceSliceLayer::backward(const UpdateCallback& callback) {
+  getOutputGrad()->addToRows(*getInputGrad(0), *rowIndice_);
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/SubNestedSequenceLayer.cpp
+++ b/paddle/gserver/layers/SubNestedSequenceLayer.cpp
@@ -52,23 +52,34 @@ private:
   *   ]
   *
   * ths output is saved to private member rowIndice_;
-   * [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
+   * [0,1,2,3,4,5,6,7,8,9,15,16,17,18,19,20,21,23,24,25,26,27]
-   *  16,17,18,19,20,21,22,23,24,25,26,27]
   */
-  void calSelectedCols(const MatrixPtr selectedIndices,
+  void calSelectedRows(const MatrixPtr selectedIndices,
                       const std::vector<std::vector<int>>& inputSeqInfo);
-  // if the second input of this layer is on GPU memory, copy it to CPU memory.
+  /*
+   * TODO(caoying)
+   * In PaddePaddle, currently all matrices are real number types,
+   * but the second is some selected indices of the give sequence to trim
+   * the nested sequence, are actually filled with int types so that storing
+   * int types information in real number matrices is very dangerous, since
+   * real numbers will be convered to int types. If a user fills this matrix
+   * himself, invalid data may occor.
+   *
+   * if the second input of this layer is on GPU memory, copy it to CPU memory.
+   */
  MatrixPtr selIdsCpu_;
-  // reorganized sequenceStartPositions and subSequenceStartPositions
+  /*
-  // into a 2d vector to facilitate the sequence selection process.
+   * reorganize sequenceStartPositions and subSequenceStartPositions
+   * into a 2d vector to facilitate the sequence selection process.
+   */
  std::vector<std::vector<int>> inputSeqInfoVec_;
-  // the final selected row indices in a batch,
+  /* store the final selected row indices in a batch */
-  // rowIdx_ and selectedRows_ actually share a same memory.
  IVectorPtr rowIndice_;
+  /* rowIndice_ and selectedRows_ actually share a same memory. */
  std::vector<int> selectedRows_;
 };
@@ -83,7 +94,7 @@ bool SubNestedSequenceLayer::init(const LayerMap& layerMap,
  return true;
 }
-void SubNestedSequenceLayer::calSelectedCols(
+void SubNestedSequenceLayer::calSelectedRows(
    const MatrixPtr selectedIndices,
    const std::vector<std::vector<int>>& inputSeqInfo) {
  selectedRows_.clear();
@@ -160,7 +171,7 @@ void SubNestedSequenceLayer::forward(PassType passType) {
  Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
                              inputSeq.subSequenceStartPositions,
                              inputSeqInfoVec_);
-  calSelectedCols(selIdsCpu_, inputSeqInfoVec_);
+  calSelectedRows(selIdsCpu_, inputSeqInfoVec_);
  resetOutput(selectedRows_.size(), getSize());
  getOutputValue()->selectRows(*getInputValue(0), *rowIndice_);

--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -34,6 +34,12 @@ add_unittest_without_exec(test_CRFLayerGrad
 add_test(NAME test_CRFLayerGrad
    COMMAND test_CRFLayerGrad)
+################ test_SeqSliceLayerGrad ####################
+add_unittest_without_exec(test_SeqSliceLayerGrad
+    test_SeqSliceLayerGrad.cpp
+    LayerGradUtil.cpp)
+add_test(NAME test_SeqSliceLayerGrad
+    COMMAND test_SeqSliceLayerGrad)
 add_unittest_without_exec(test_ActivationGrad
    test_ActivationGrad.cpp

--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -2025,6 +2025,21 @@ TEST(Layer, RowL2NormLayer) {
  }
 }
+TEST(Layer, ScaleShiftLayer) {
+  const size_t batchSize = 16;
+  const size_t size = 32;
+  TestConfig config;
+  config.layerConfig.set_type("scale_shift");
+  config.layerConfig.set_size(size);
+  config.biasSize = 1;
+  config.inputDefs.push_back(
+      {INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1});
+  config.layerConfig.add_inputs();
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false);
+  }
+}
 int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  initMain(argc, argv);

--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
@@ -269,7 +269,8 @@ TEST(Compare, img_conv2) {
  bool useGpu = FLAGS_use_gpu;
  double eps = FLAGS_checkgrad_eps;
  FLAGS_use_gpu = true;
-  FLAGS_checkgrad_eps = 1e-2;
+  // Sometimes, this unit test will fail with 1e-2
+  FLAGS_checkgrad_eps = 4e-2;
  compareNetwork(config_file_a, config_file_b);
  FLAGS_use_gpu = useGpu;
  FLAGS_checkgrad_eps = eps;

--- a/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
+++ b/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <gtest/gtest.h>
+#include "ModelConfig.pb.h"
+#include "paddle/gserver/layers/DataLayer.h"
+#include "paddle/trainer/Trainer.h"
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+const int MAX_SEQ_NUM = 17;
+const int MAX_SEQ_LEN = 23;
+const int MAX_BEAM_SIZE = 13;
+vector<real> randSampling(real range, int n) {
+  CHECK_GE(range, n);
+  vector<real> num(range);
+  iota(begin(num), end(num), 0.);
+  if (range == n) return num;
+  random_shuffle(begin(num), end(num));
+  num.resize(n);
+  sort(begin(num), end(num));
+  return num;
+}
+void genSeqInfo(vector<int>& seqStartPos, vector<int>& subSeqStartPos) {
+  seqStartPos.resize(1, 0);
+  subSeqStartPos.resize(1, 0);
+  srand((size_t)(time(NULL)));
+  int seqNum = 1 + (rand() % MAX_SEQ_NUM);
+  for (int i = 0; i < seqNum; ++i) {
+    int subSeqNum = 1 + (rand() % MAX_SEQ_NUM);
+    for (int j = 0; j < subSeqNum; ++j)
+      subSeqStartPos.push_back(subSeqStartPos.back() +
+                               (1 + (rand() % MAX_SEQ_LEN)));
+    seqStartPos.push_back(subSeqStartPos.back());
+  }
+}
+/*
+  generate start indices according to sequence start positions.
+ */
+void genStarts(vector<int>& seqStartPos,
+               vector<vector<real>>& starts,
+               size_t beamSize) {
+  starts.clear();
+  starts.resize(seqStartPos.size() - 1, vector<real>(beamSize, -1.));
+  for (size_t i = 0; i < seqStartPos.size() - 1; ++i) {
+    int seqLen = seqStartPos[i + 1] - seqStartPos[i];
+    vector<real> randStarts =
+        randSampling(seqLen, min(seqLen, static_cast<int>(beamSize)));
+    copy(begin(randStarts), end(randStarts), begin(starts[i]));
+  }
+}
+/*
+  generate end indices according to sequence start positions and start indices.
+ */
+void genEnds(vector<int>& seqStartPos,
+             vector<vector<real>>& starts,
+             vector<vector<real>>& ends,
+             size_t beamSize) {
+  CHECK_EQ(seqStartPos.size() - 1, starts.size());
+  ends.clear();
+  ends.resize(seqStartPos.size() - 1, vector<real>(beamSize, -1.));
+  for (size_t i = 0; i < starts.size(); ++i) {
+    for (size_t j = 0; j < starts[i].size(); ++j) {
+      int seqLen = seqStartPos[i + 1] - seqStartPos[i];
+      CHECK_GE(seqLen - 1, starts[i][j]);
+      if (starts[i][j] == -1.) break;
+      if (starts[i][j] == (seqLen - 1)) {
+        ends[i][j] = starts[i][j];
+      } else {
+        ends[i][j] = starts[i][j] + randSampling(seqLen - starts[i][j], 1)[0];
+      }
+    }
+  }
+}
+void genTestData(vector<int>& seqStartPos,
+                 vector<int>& subSeqStartPos,
+                 vector<vector<real>>& starts,
+                 vector<vector<real>>& ends,
+                 bool hasSubseq) {
+  size_t beamSize = 1 + (rand() % MAX_BEAM_SIZE);
+  genSeqInfo(seqStartPos, subSeqStartPos);
+  genStarts(hasSubseq ? subSeqStartPos : seqStartPos, starts, beamSize);
+  genEnds(hasSubseq ? subSeqStartPos : seqStartPos, starts, ends, beamSize);
+}
+template <typename T>
+void flatten2dVector(vector<vector<T>>& inVec, vector<T>& outVec) {
+  size_t totalSize{0};
+  for (auto const& items : inVec) totalSize += items.size();
+  outVec.reserve(totalSize);
+  for (auto& items : inVec)
+    move(items.begin(), items.end(), back_inserter(outVec));
+}
+void testSeqSliceLayer(bool hasSubseq,
+                       bool useGpu,
+                       vector<int>& seqStartPos,
+                       vector<int>& subSeqStartPos,
+                       vector<vector<real>>& starts,
+                       vector<vector<real>>& ends) {
+  // layer size is not crutial for this layer,
+  // so here use a small layer size in the unittest.
+  const size_t layerSize{4};
+  TestConfig config;
+  config.layerConfig.set_type("seq_slice");
+  config.layerConfig.set_size(layerSize);
+  // add the first input
+  MatrixPtr seqInputPtr =
+      Matrix::create(hasSubseq ? subSeqStartPos.back() : seqStartPos.back(),
+                     layerSize,
+                     false,
+                     false);
+  seqInputPtr->randomizeUniform();
+  if (hasSubseq) {
+    config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                                "seq_input",
+                                seqInputPtr,
+                                seqStartPos,
+                                subSeqStartPos});
+  } else {
+    config.inputDefs.push_back(
+        {INPUT_SELF_DEFINE_DATA, "seq_input", seqInputPtr, seqStartPos});
+  }
+  config.layerConfig.add_inputs();
+  // add start indices
+  if (starts.size()) {
+    vector<real> startsToVec;
+    flatten2dVector(starts, startsToVec);
+    MatrixPtr startMatrixPtr =
+        Matrix::create(starts.size(), starts[0].size(), false, false);
+    startMatrixPtr->copyFrom(startsToVec.data(), startsToVec.size());
+    config.inputDefs.push_back(
+        {INPUT_SELF_DEFINE_DATA, "starts", startMatrixPtr});
+    config.layerConfig.add_inputs();
+    config.layerConfig.set_select_first(true);
+  }
+  // add end indices
+  if (ends.size()) {
+    vector<real> endsToVec;
+    flatten2dVector(ends, endsToVec);
+    MatrixPtr endMatrixPtr =
+        Matrix::create(ends.size(), ends[0].size(), false, false);
+    endMatrixPtr->copyFrom(endsToVec.data(), endsToVec.size());
+    config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "ends", endMatrixPtr});
+    config.layerConfig.add_inputs();
+    config.layerConfig.set_select_first(false);
+  }
+  testLayerGrad(config, "seq_slice", /*batchSize*/ 100, false, useGpu, false);
+}
+TEST(Layer, SeqSliceLayer) {
+  vector<int> seqStartPos;
+  vector<int> subSeqStartPos;
+  vector<vector<real>> starts;
+  vector<vector<real>> ends;
+  std::vector<bool> mode = {false};
+#ifndef PADDLE_ONLY_CPU
+  mode.push_back(true);
+#endif
+  genSeqInfo(seqStartPos, subSeqStartPos);
+  for (bool hasSubseq : {true, false}) {
+    LOG(INFO) << "hasSubSeq : " << hasSubseq;
+    genTestData(seqStartPos, subSeqStartPos, starts, ends, hasSubseq);
+    for (bool useGpu : mode) {
+      vector<vector<real>> tmp;
+      testSeqSliceLayer(
+          hasSubseq, useGpu, seqStartPos, subSeqStartPos, tmp, ends);
+      testSeqSliceLayer(
+          hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, tmp);
+      testSeqSliceLayer(
+          hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, ends);
+    }
+  }
+}
+int main(int argc, char** argv) {
+  initMain(argc, argv);
+  hl_start();
+  hl_init(FLAGS_gpu_id);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -19,8 +19,13 @@ limitations under the License. */
 #include <memory>     // for unique_ptr
 #include <mutex>      // for call_once
+#include "glog/logging.h"
 #include "paddle/memory/detail/buddy_allocator.h"
 #include "paddle/memory/detail/system_allocator.h"
+#include "paddle/platform/gpu_info.h"
+DECLARE_double(fraction_of_gpu_memory_to_use);
 namespace paddle {
 namespace memory {
@@ -80,6 +85,11 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
                                                 platform::GpuMinChunkSize(),
                                                 platform::GpuMaxChunkSize()));
    }
+    VLOG(3) << "\n\nNOTE: each GPU device use "
+            << FLAGS_fraction_of_gpu_memory_to_use * 100 << "% of GPU memory.\n"
+            << "You can set environment variable '"
+            << platform::kEnvFractionGpuMemoryToUse
+            << "' to change the fraction of GPU usage.\n\n";
  });
  platform::SetDeviceId(gpu_id);

--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -14,7 +14,6 @@ limitations under the License. */
 #pragma once
-#include "paddle/platform/gpu_info.h"
 #include "paddle/platform/place.h"
 namespace paddle {

--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -43,6 +43,7 @@ endfunction()
 add_subdirectory(math)
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
+op_library(gather_op SRCS gather_op.cc gather_op.cu)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
@@ -68,3 +69,5 @@ op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
    DEPS framework_proto tensor op_registry operator net_op)
 op_library(uniform_random_op
        SRCS uniform_random_op.cc uniform_random_op.cu)
+op_library(scale_op SRCS scale_op.cc scale_op.cu DEPS net_op)
+op_library(minus_op SRCS minus_op.cc minus_op.cu DEPS scale_op)
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -39,11 +39,10 @@ class OnehotCrossEntropyGradientOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
-    auto X_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto dX = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto X = ctx.Input<Tensor>("X");
-    // TODO(superjom) add enforce here after helper functions ready
+    dX->Resize(X->dims());
-    X_grad->Resize(X->dims());
  }
 };
@@ -70,9 +69,7 @@ namespace ops = paddle::operators;
 REGISTER_OP(onehot_cross_entropy, ops::OnehotCrossEntropyOp,
            ops::OnehotCrossEntropyOpMaker, onehot_cross_entropy_grad,
            ops::OnehotCrossEntropyGradientOp);
-REGISTER_OP_CPU_KERNEL(
+REGISTER_OP_CPU_KERNEL(onehot_cross_entropy,
-    onehot_cross_entropy,
+                       ops::OnehotCrossEntropyOpKernel<float>);
-    ops::OnehotCrossEntropyOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(onehot_cross_entropy_grad,
-REGISTER_OP_CPU_KERNEL(
+                       ops::OnehotCrossEntropyGradientOpKernel<float>);
-    onehot_cross_entropy_grad,
-    ops::OnehotCrossEntropyGradientOpKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
@@ -12,10 +12,122 @@
   See the License for the specific language governing permissions and
   limitations under the License. */
-#define EIGEN_USE_GPU
+#include "paddle/framework/op_registry.h"
-#include "paddle/operators/cross_entropy_op.h"
+#include "paddle/platform/assert.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+template <typename T>
+__host__ __device__ T clipping_log(const T x) {
+  PADDLE_ASSERT(std::is_floating_point<T>::value);
+  const T kApproInf = 1e20;
+  T v = log(x);
+  if (v == INFINITY) {
+    return kApproInf;
+  }
+  if (v == -INFINITY) {
+    return -kApproInf;
+  }
+  return v;
+}
+template <typename T>
+__global__ void CrossEntropyKernel(T* Y, const T* X, const int* label,
+                                   const int N, const int D) {
+  // TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file.
+  // CUDA_1D_KERNEL_LOOP(i, N) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
+    PADDLE_ASSERT(label[i] >= 0 && label[i] < D);
+    Y[i] = -clipping_log(X[i * D + label[i]]);
+  }
+}
+// TODO(qingqing): make zero setting an common function.
+template <typename T>
+__global__ void zero(T* X, const int N) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
+    X[i] = 0.0;
+  }
+}
+template <typename T>
+__global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
+                                           const int* label, const int N,
+                                           const int D) {
+  // TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file.
+  // CUDA_1D_KERNEL_LOOP(i, N) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
+    int idx = i * D + label[i];
+    dX[idx] = -dY[i] / X[idx];
+  }
+}
+template <typename T>
+class OnehotCrossEntropyOpCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use GPUPlace.");
+    auto X = ctx.Input<Tensor>("X");
+    const T* Xdata = X->data<T>();
+    const int* label_data = ctx.Input<Tensor>("label")->data<int>();
+    auto Y = ctx.Output<Tensor>("Y");
+    Y->mutable_data<T>(ctx.GetPlace());
+    T* Ydata = Y->data<T>();
+    int N = X->dims()[0];
+    int D = X->dims()[1];
+    int block = 512;
+    int grid = (N + block - 1) / block;
+    // TODO(qingqing) launch kernel on specified stream
+    // base on ExecutionContext.
+    CrossEntropyKernel<T><<<grid, block>>>(Ydata, Xdata, label_data, N, D);
+  }
+};
+template <typename T>
+class OnehotCrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use GPUPlace.");
+    auto X = ctx.Input<Tensor>("X");
+    auto dX = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto dY = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    auto label = ctx.Input<Tensor>("label");
+    auto* dXdata = dX->template mutable_data<T>(ctx.GetPlace());
+    auto* dYdata = dY->template data<T>();
+    auto* Xdata = X->template data<T>();
+    auto* label_data = label->data<int>();
+    int N = X->dims()[0];
+    int D = X->dims()[1];
+    int block = 512;
+    int grid = (N * D + block - 1) / block;
+    zero<T><<<grid, block>>>(dXdata, N * D);
+    grid = (N + block - 1) / block;
+    // TODO(qingqing): launch kernel on specified stream
+    // base on ExecutionContext.
+    CrossEntropyGradientKernel<T><<<grid, block>>>(dXdata, dYdata, Xdata,
+                                                   label_data, N, D);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_GPU_KERNEL(onehot_cross_entropy,
-    onehot_cross_entropy,
+                       ops::OnehotCrossEntropyOpCUDAKernel<float>);
-    ops::OnehotCrossEntropyOpKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(onehot_cross_entropy_grad,
+                       ops::OnehotCrossEntropyGradientOpCUDAKernel<float>);
--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
@@ -21,7 +21,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 template <typename T>
-T tolerable_value(T x) {
+inline T tolerable_value(const T x) {
  static_assert(std::is_floating_point<T>::value,
                "tolerable_value works only on float, "
                "double and double double.");
@@ -39,10 +39,13 @@ T tolerable_value(T x) {
  return x;
 }
-template <typename Place, typename T>
+template <typename T>
 class OnehotCrossEntropyOpKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
    auto X = ctx.Input<Tensor>("X");
    const T* Xdata = X->data<T>();
    const int* label_data = ctx.Input<Tensor>("label")->data<int>();
@@ -62,10 +65,13 @@ class OnehotCrossEntropyOpKernel : public framework::OpKernel {
  }
 };
-template <typename Place, typename T>
+template <typename T>
 class OnehotCrossEntropyGradientOpKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
    auto X = ctx.Input<Tensor>("X");
    auto dX = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto dY = ctx.Input<Tensor>(framework::GradVarName("Y"));
@@ -79,6 +85,8 @@ class OnehotCrossEntropyGradientOpKernel : public framework::OpKernel {
    const int batch_size = X->dims()[0];
    const int class_num = X->dims()[1];
+    // TODO(qingqing): make zero setting an common function.
+    memset(dXdata, 0, sizeof(T) * batch_size * class_num);
    for (int i = 0; i < batch_size; ++i) {
      int index = i * class_num + label_data[i];
      dXdata[index] = -tolerable_value(dYdata[i] / Xdata[index]);

--- a/paddle/operators/gather.h
+++ b/paddle/operators/gather.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <cstring>
 #include "paddle/framework/ddim.h"
+#include "paddle/framework/eigen.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/platform/place.h"
@@ -25,13 +26,13 @@ namespace operators {
 // Implementation of CPU copy
 template <typename T>
-void CPUGather(const T* params, const int* indices, const int slice_size,
+void CPUGather(const T* src, const int* indices, const int slice_size,
               const int index_size, T* output) {
  const size_t slice_bytes = slice_size * sizeof(T);
  for (int i = 0; i < index_size; ++i) {
    int index_ = indices[i];
-    memcpy(output + i * slice_size, params + index_ * slice_size, slice_bytes);
+    memcpy(output + i * slice_size, src + index_ * slice_size, slice_bytes);
  }
 }
@@ -55,7 +56,7 @@ void Gather(const platform::Place& place, const paddle::framework::Tensor* src,
  int index_size = index->dims()[0];
  auto src_dims = src->dims();
-  paddle::framework::DDim output_dims(src_dims);
+  framework::DDim output_dims(src_dims);
  output_dims[0] = index_size;
  // slice size

--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/gather_op.h"
+#include "paddle/framework/ddim.h"
+namespace paddle {
+namespace operators {
+class GatherOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    int batch_size = ctx.Input<Tensor>("Index")->dims()[0];
+    PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >0");
+    framework::DDim output_dims(ctx.Input<Tensor>("X")->dims());
+    output_dims[0] = batch_size;
+    ctx.Output<Tensor>("Out")->Resize(output_dims);
+  }
+};
+class GatherGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto X_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto X = ctx.Input<Tensor>("X");
+    X_grad->Resize(X->dims());
+  }
+};
+class GatherOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  GatherOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The source input of gather op");
+    AddInput("Index", "The index input of gather op");
+    AddOutput("Out", "The output of add op");
+    AddComment(R"DOC(
+Gather Operator by selecting from the first axis, 
+Out = X[Index]
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, gather_grad,
+            ops::GatherGradOp);
+REGISTER_OP_CPU_KERNEL(gather,
+                       ops::GatherOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    gather_grad,
+    ops::GatherGradientOpKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/gather_op.cu
+++ b/paddle/operators/gather_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#define EIGEN_USE_GPU
+#include "paddle/operators/gather_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(gather,
+                       ops::GatherOpKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/gather_op.h
+++ b/paddle/operators/gather_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "gather.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "scatter.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+template <typename Place, typename T>
+class GatherOpKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *X = ctx.Input<Tensor>("X");
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *Y = ctx.Output<Tensor>("Out");
+    Y->mutable_data<T>(ctx.GetPlace());
+    Gather<T>(ctx.GetPlace(), X, Index, Y);
+  }
+};
+template <typename Place, typename T>
+class GatherGradientOpKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    dX->mutable_data<T>(ctx.GetPlace());
+    ScatterUpdate<T>(ctx.GetPlace(), dO, Index, dX);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
   http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -19,25 +16,25 @@ namespace paddle {
 namespace operators {
 template <typename T>
-class GaussianRandomKernel : public framework::OpKernel {
+class CPUGaussianRandomKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    float mean = context.op_.GetAttr<float>("mean");
    float std = context.op_.GetAttr<float>("std");
-    auto* tensor = context.Output<framework::Tensor>(0);
+    auto* tensor = context.Output<framework::Tensor>("Out");
    T* data = tensor->mutable_data<T>(context.GetPlace());
-    // TODO(dzh): attribute does not support unsigned int.
+    unsigned int seed =
-    // And we need a global random seed configuration.
+        static_cast<unsigned int>(context.op_.GetAttr<int>("seed"));
-    int seed = context.op_.GetAttr<int>("seed");
+    std::minstd_rand engine;
    if (seed == 0) {
      seed = std::random_device()();
    }
-    std::mt19937 g(seed);
+    engine.seed(seed);
-    std::normal_distribution<T> distribution(mean, std);
+    std::normal_distribution<T> dist(mean, std);
    ssize_t size = framework::product(tensor->dims());
-    for (int i = 0; i < size; ++i) {
+    for (ssize_t i = 0; i < size; ++i) {
-      data[i] = distribution(g);
+      data[i] = dist(engine);
    }
  }
 };
@@ -48,7 +45,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext& context) const override {
-    auto* tensor = context.Output<framework::Tensor>(0);
+    auto* tensor = context.Output<framework::Tensor>("Out");
    auto dims = GetAttr<std::vector<int>>("dims");
    PADDLE_ENFORCE(dims.size() > 0UL,
                   "dims can be one int or array. dims must be set.");
@@ -68,8 +65,8 @@ Use to initialize tensor with gaussian random generator.
 )DOC");
    AddAttr<std::vector<int>>("dims", "The dimension of random tensor.");
-    AddAttr<float>("mean", "mean value of random.").SetDefault(.0f);
+    AddAttr<float>("mean", "mean of random tensor.").SetDefault(.0f);
-    AddAttr<float>("std", "minimum value of random value.").SetDefault(1.0f);
+    AddAttr<float>("std", "std of random tensor.").SetDefault(1.0f);
    AddAttr<int>("seed",
                 "Random seed of generator."
                 "0 means use system wide seed")
@@ -83,4 +80,4 @@ Use to initialize tensor with gaussian random generator.
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp,
                             ops::GaussianRandomOpMaker);
-REGISTER_OP_CPU_KERNEL(gaussian_random, ops::GaussianRandomKernel<float>);
+REGISTER_OP_CPU_KERNEL(gaussian_random, ops::CPUGaussianRandomKernel<float>);
--- a/paddle/operators/gaussian_random_op.cu
+++ b/paddle/operators/gaussian_random_op.cu
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
   http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */
-#include <memory>
+#include <thrust/device_ptr.h>
-#include <random>
+#include <thrust/iterator/counting_iterator.h>
-#include "paddle/platform/dynload/curand.h"
+#include <thrust/random.h>
-#include "paddle/platform/gpu_info.h"
+#include <thrust/transform.h>
 #include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
 namespace paddle {
 namespace operators {
 template <typename T>
-class GaussianRandomKernel : public framework::OpKernel {
+struct GaussianGenerator {
+  T mean_, std_;
+  unsigned int seed_;
+  __host__ __device__ GaussianGenerator(T mean, T std, int seed)
+      : mean_(mean), std_(std), seed_(seed) {}
+  __host__ __device__ T operator()(const unsigned int n) const {
+    thrust::minstd_rand rng;
+    rng.seed(seed_);
+    thrust::normal_distribution<T> dist(mean_, std_);
+    rng.discard(n);
+    return dist(rng);
+  }
+};
+template <typename T>
+class GPUGaussianRandomKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    float mean = context.op_.GetAttr<float>("mean");
+    auto* tensor = context.Output<framework::Tensor>("Out");
-    float std = context.op_.GetAttr<float>("std");
-    auto* tensor = context.Output<framework::Tensor>(0);
    T* data = tensor->mutable_data<T>(context.GetPlace());
+    unsigned int seed =
-    int seed = context.op_.GetAttr<int>("seed");
+        static_cast<unsigned int>(context.op_.GetAttr<int>("seed"));
    if (seed == 0) {
      std::random_device rd;
      seed = rd();
    }
-    curandGenerator_t g;
+    T mean = static_cast<T>(context.op_.GetAttr<float>("mean"));
-    PADDLE_ENFORCE(platform::dynload::curandCreateGenerator(
+    T std = static_cast<T>(context.op_.GetAttr<float>("std"));
-        &g, CURAND_RNG_PSEUDO_DEFAULT));
+    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
-    PADDLE_ENFORCE(
+    ssize_t N = framework::product(tensor->dims());
-        platform::dynload::curandSetPseudoRandomGeneratorSeed(g, seed));
+    thrust::transform(index_sequence_begin, index_sequence_begin + N,
-    platform::dynload::curandGenerateNormal(
+                      thrust::device_ptr<T>(data),
-        g, data, framework::product(tensor->dims()), mean, std);
+                      GaussianGenerator<T>(mean, std, seed));
  }
 };
 }  // namespace operators
 }  // namespace paddle
-namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(gaussian_random,
-REGISTER_OP_GPU_KERNEL(gaussian_random, ops::GaussianRandomKernel<float>);
+                       paddle::operators::GPUGaussianRandomKernel<float>);
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -25,8 +25,8 @@ void gemm<platform::CPUPlace, float>(const CBLAS_TRANSPOSE transA,
                                     const float alpha, const float* A,
                                     const float* B, const float beta, float* C,
                                     platform::DeviceContext* context) {
-  int lda = K;
+  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = N;
+  int ldb = (transB == CblasNoTrans) ? N : K;
  int ldc = N;
  cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
              beta, C, ldc);
@@ -40,8 +40,8 @@ void gemm<platform::CPUPlace, double>(const CBLAS_TRANSPOSE transA,
                                      const double* B, const double beta,
                                      double* C,
                                      platform::DeviceContext* context) {
-  int lda = K;
+  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = N;
+  int ldb = (transB == CblasNoTrans) ? N : K;
  int ldc = N;
  cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
              beta, C, ldc);

--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/operators/minus_op.h"
+#include "paddle/operators/net_op.h"
+namespace paddle {
+namespace operators {
+class MinusOp : public framework::OperatorWithKernel {
+ public:
+  MinusOp(const std::string &type, const framework::VariableNameMap &inputs,
+          const framework::VariableNameMap &outputs,
+          const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto *left_tensor = ctx.Input<framework::Tensor>("X");
+    auto *right_tensor = ctx.Input<framework::Tensor>("Y");
+    PADDLE_ENFORCE_EQ(
+        framework::product(left_tensor->dims()),
+        framework::product(right_tensor->dims()),
+        "Minus operator must take two tensor with same num of elements");
+    ctx.Output<framework::Tensor>("Out")->Resize(left_tensor->dims());
+  }
+};
+class MinusOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MinusOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The left tensor of minus operator.").NotInGradient();
+    AddInput("Y", "The right tensor of minus operator.").NotInGradient();
+    AddOutput("Out", "The output tensor of minus operator.").NotInGradient();
+    AddComment(R"DOC(Minus Operator
+Equation: Out = X - Y
+)DOC");
+  }
+};
+template <typename AttrType>
+class MinusGradOp : public NetOp {
+ public:
+  MinusGradOp(const std::string &type, const framework::VariableNameMap &inputs,
+              const framework::VariableNameMap &outputs,
+              const framework::AttributeMap &attrs)
+      : NetOp(type, inputs, outputs, attrs) {
+    auto out_grad = Input(framework::GradVarName("Out"));
+    auto x_grad = Output(framework::GradVarName("X"));
+    auto y_grad = Output(framework::GradVarName("Y"));
+    // x_grad = out_grad
+    AppendOp(framework::OpRegistry::CreateOp("identity", {{"X", {out_grad}}},
+                                             {{"Out", {x_grad}}}, {}));
+    framework::AttributeMap scale_attr;
+    scale_attr["scale"] = static_cast<AttrType>(-1);
+    AppendOp(framework::OpRegistry::CreateOp("scale", {{"X", {out_grad}}},
+                                             {{"Out", {y_grad}}}, scale_attr));
+    CompleteAddOp(false);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+USE_OP(scale);
+USE_OP_ITSELF(identity);
+namespace ops = paddle::operators;
+REGISTER_OP(minus, ops::MinusOp, ops::MinusOpMaker, minus_grad,
+            ops::MinusGradOp<float>);
+REGISTER_OP_CPU_KERNEL(minus,
+                       ops::MinusKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/minus_op.cu
+++ b/paddle/operators/minus_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/operators/minus_op.h"
+REGISTER_OP_GPU_KERNEL(
+    minus, paddle::operators::MinusKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/minus_op.h
+++ b/paddle/operators/minus_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+template <typename Place, typename T>
+class MinusKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* left_tensor = context.Input<framework::Tensor>("X");
+    auto* right_tensor = context.Input<framework::Tensor>("Y");
+    auto* out_tensor = context.Output<framework::Tensor>("Out");
+    out_tensor->mutable_data<T>(context.GetPlace());
+    auto& dev = context.GetEigenDevice<Place>();
+    framework::EigenVector<T>::Flatten(*out_tensor).device(dev) =
+        framework::EigenVector<T>::Flatten(*left_tensor) -
+        framework::EigenVector<T>::Flatten(*right_tensor);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -13,11 +13,12 @@
   limitations under the License. */
 #include "paddle/operators/mul_op.h"
-#include "paddle/operators/math/math_function.h"
 namespace paddle {
 namespace operators {
+using framework::Tensor;
 class MulOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@@ -59,10 +60,23 @@ class MulOpGrad : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;
 protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {}
+  void InferShape(const framework::InferShapeContext &ctx) const override {
-  std::string DebugString() const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
-    LOG(INFO) << "MulGrad";
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should not be null");
-    return "";
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
+                            "Input(Out@GRAD) should not be null");
+    auto x_dims = ctx.Input<Tensor>("X")->dims();
+    auto y_dims = ctx.Input<Tensor>("Y")->dims();
+    auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
+    auto *x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *y_grad = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    PADDLE_ENFORCE(x_dims[0] == out_dims[0],
+                   "Out@GRAD M X N must equal to X dims 0, M ");
+    PADDLE_ENFORCE(y_dims[1] == out_dims[1],
+                   "Out@GRAD M X N must equal to Y dims 1, N ");
+    x_grad->Resize(x_dims);
+    y_grad->Resize(y_dims);
  }
 };
@@ -72,3 +86,5 @@ class MulOpGrad : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
 REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(mul_grad,
+                       ops::MulGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/mul_op.cu
+++ b/paddle/operators/mul_op.cu
@@ -17,3 +17,5 @@
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(mul_grad,
+                       ops::MulGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@@ -31,18 +31,34 @@ template <typename Place, typename T>
 class MulKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair = {
+    auto* X = context.Input<Tensor>("X");
-        {Eigen::IndexPair<Eigen::DenseIndex>(1, 0)}};
+    auto* Y = context.Input<Tensor>("Y");
-    auto* input0 = context.Input<Tensor>("X");
+    auto* Z = context.Output<Tensor>("Out");
-    auto* input1 = context.Input<Tensor>("Y");
+    Z->mutable_data<T>(context.GetPlace());
-    auto* output = context.Output<Tensor>("Out");
+    auto* device_context =
-    output->mutable_data<T>(context.GetPlace());
+        const_cast<platform::DeviceContext*>(context.device_context_);
-    auto X = EigenMatrix<T>::From(*input0);
+    math::matmul<Place, T>(*X, false, *Y, false, 1, Z, 0, device_context);
-    auto Y = EigenMatrix<T>::From(*input1);
+  }
-    auto Z = EigenMatrix<T>::From(*output);
+};
-    auto& place = context.GetEigenDevice<Place>();
+template <typename Place, typename T>
-    Z.device(place) = X.contract(Y, dim_pair);
+class MulGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* X = ctx.Input<Tensor>("X");
+    auto* Y = ctx.Input<Tensor>("Y");
+    auto* dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dX = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dY = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    dX->mutable_data<T>(ctx.GetPlace());
+    dY->mutable_data<T>(ctx.GetPlace());
+    auto* device_context =
+        const_cast<platform::DeviceContext*>(ctx.device_context_);
+    // dX = dOut * Y'. dX: M x K, dOut : M x N, Y : K x N
+    math::matmul<Place, T>(*dOut, false, *Y, true, 1, dX, 0, device_context);
+    // dY = X' * dOut. dY: K x N, dOut : M x N, X : M x K
+    math::matmul<Place, T>(*X, true, *dOut, false, 1, dY, 0, device_context);
  }
 };

--- a/paddle/operators/net_op.cc
+++ b/paddle/operators/net_op.cc
@@ -68,10 +68,15 @@ std::string NetOp::DebugString() const {
 bool NetOp::IsNetOp() const { return true; }
 std::vector<std::string> NetOp::OutputVars(bool has_intermediate) const {
+  std::vector<std::string> all;
+  for (auto& pair : this->outputs_) {
+    for (auto& var_name : pair.second) {
+      all.push_back(var_name);
+    }
+  }
  if (has_intermediate) {
-    return this->outputs_.at(kAll);
+    return all;
  }
-  auto& all = this->outputs_.at(kAll);
  std::vector<std::string> ret_val;
  for (auto& each : all) {
    if (!Contains(intermediate_outputs_, each)) {
@@ -81,9 +86,8 @@ std::vector<std::string> NetOp::OutputVars(bool has_intermediate) const {
  return ret_val;
 }
-NetOp::NetOp(const std::string& type,
+NetOp::NetOp(const std::string& type, const framework::VariableNameMap& inputs,
-             const framework::OperatorBase::VarNameMap& inputs,
+             const framework::VariableNameMap& outputs,
-             const framework::OperatorBase::VarNameMap& outputs,
             const framework::AttributeMap& attrs)
    : framework::OperatorBase(type, inputs, outputs, attrs) {}

--- a/paddle/operators/net_op.h
+++ b/paddle/operators/net_op.h
@@ -38,8 +38,10 @@ class NetOp : public framework::OperatorBase {
 public:
  static const char kAll[];
  NetOp() : framework::OperatorBase("plain_net", {}, {}, {}) {}
-  NetOp(const std::string& type, const VarNameMap& inputs,
-        const VarNameMap& outputs, const framework::AttributeMap& attrs);
+  NetOp(const std::string& type, const framework::VariableNameMap& inputs,
+        const framework::VariableNameMap& outputs,
+        const framework::AttributeMap& attrs);
  NetOp(const NetOp& o) : framework::OperatorBase(o.type_, {}, {}, o.attrs_) {
    this->ops_.reserve(o.ops_.size());
@@ -84,13 +86,14 @@ class NetOp : public framework::OperatorBase {
    return true;
  }
-  void AddOp(const framework::OperatorBase& op) { AddOp(op.Clone()); }
+  void AppendOp(const framework::OperatorBase& op) { AppendOp(op.Clone()); }
  /**
   * @brief Add an operator by ptr
   */
-  void AddOp(std::unique_ptr<framework::OperatorBase> op) {
+  void AppendOp(std::unique_ptr<framework::OperatorBase> op) {
-    PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed");
+    PADDLE_ENFORCE(!add_op_done_,
+                   "Cannot AppendOp when this network is sealed");
    PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op");
    ops_.push_back(std::move(op));
  }

--- a/paddle/operators/net_op_test.cc
+++ b/paddle/operators/net_op_test.cc
@@ -38,10 +38,10 @@ TEST(OpKernel, all) {
  auto net = std::make_shared<NetOp>();
  ASSERT_NE(net, nullptr);
-  net->AddOp(std::unique_ptr<TestOp>(
+  net->AppendOp(std::unique_ptr<TestOp>(
      new TestOp("test", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
                 {{"Out", {"y"}}}, {})));
-  net->AddOp(std::unique_ptr<TestOp>(
+  net->AppendOp(std::unique_ptr<TestOp>(
      new TestOp("test", {{"X", {"y"}}, {"W", {"w2"}}, {"b", {"b2"}}},
                 {{"Out", {"z"}}}, {})));
@@ -61,7 +61,7 @@ TEST(NetOp, insert_op) {
  auto op1 = std::unique_ptr<framework::NOP>(
      new framework::NOP("empty", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
                         {{"Out", {"y"}}}, {}));
-  net.AddOp(*op1);
+  net.AppendOp(*op1);
  net.InsertOp(0, *op1);
  ASSERT_EQ(2UL, net.ops_.size());
  net.InsertOp(2, std::move(op1));
@@ -70,16 +70,16 @@ TEST(NetOp, insert_op) {
 TEST(NetOp, Clone) {
  NetOp net;
-  net.AddOp(
+  net.AppendOp(
      std::unique_ptr<framework::NOP>(new framework::NOP{"empty", {}, {}, {}}));
-  net.AddOp(std::unique_ptr<framework::NOP>(
+  net.AppendOp(std::unique_ptr<framework::NOP>(
      new framework::NOP{"empty2", {}, {}, {}}));
  net.CompleteAddOp(true);
  auto new_net_op = net.Clone();
  ASSERT_NE(new_net_op, nullptr);
  ASSERT_TRUE(new_net_op->IsNetOp());
  auto* new_net = static_cast<NetOp*>(new_net_op.get());
-  ASSERT_EQ(2, new_net->ops_.size());
+  ASSERT_EQ(2UL, new_net->ops_.size());
  ASSERT_EQ(new_net->ops_[0]->Type(), "empty");
  ASSERT_EQ(new_net->ops_[1]->Type(), "empty2");
 }

--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -131,8 +131,8 @@ const rnn::ArgumentName RecurrentGradientOp::kArgName{
    "memories",    "pre_memories", "boot_memories@grad"};
 RecurrentOp::RecurrentOp(const std::string& type,
-                         const framework::OperatorBase::VarNameMap& inputs,
+                         const framework::VariableNameMap& inputs,
-                         const framework::OperatorBase::VarNameMap& outputs,
+                         const framework::VariableNameMap& outputs,
                         const framework::AttributeMap& attrs)
    : OperatorBase(type, inputs, outputs, attrs) {
  rnn::InitArgument(kArgName, &arg_, *this);
@@ -223,8 +223,8 @@ void RecurrentGradientAlgorithm::InferShape(const Scope& scope) const {
 }
 RecurrentGradientOp::RecurrentGradientOp(
-    const std::string& type, const framework::OperatorBase::VarNameMap& inputs,
+    const std::string& type, const framework::VariableNameMap& inputs,
-    const framework::OperatorBase::VarNameMap& outputs,
+    const framework::VariableNameMap& outputs,
    const framework::AttributeMap& attrs)
    : OperatorBase(type, inputs, outputs, attrs) {
  rnn::InitArgument(kArgName, &arg_, *this);

--- a/paddle/operators/recurrent_op.h
+++ b/paddle/operators/recurrent_op.h
@@ -114,8 +114,9 @@ class RecurrentGradientAlgorithm {
 class RecurrentOp : public framework::OperatorBase {
 public:
-  RecurrentOp(const std::string& type, const VarNameMap& inputs,
+  RecurrentOp(const std::string& type, const framework::VariableNameMap& inputs,
-              const VarNameMap& outputs, const framework::AttributeMap& attrs);
+              const framework::VariableNameMap& outputs,
+              const framework::AttributeMap& attrs);
  RecurrentOp(const RecurrentOp& o)
      : framework::OperatorBase(
@@ -150,8 +151,9 @@ class RecurrentOp : public framework::OperatorBase {
 class RecurrentGradientOp : public framework::OperatorBase {
 public:
-  RecurrentGradientOp(const std::string& type, const VarNameMap& inputs,
+  RecurrentGradientOp(const std::string& type,
-                      const VarNameMap& outputs,
+                      const framework::VariableNameMap& inputs,
+                      const framework::VariableNameMap& outputs,
                      const framework::AttributeMap& attrs);
  RecurrentGradientOp(const RecurrentGradientOp& o)

--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -17,7 +17,9 @@
 namespace paddle {
 namespace operators {
-class RowWiseAddOp : public framework::OperatorWithKernel {
+using framework::Tensor;
+class RowwiseAddOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@@ -34,9 +36,9 @@ class RowWiseAddOp : public framework::OperatorWithKernel {
  }
 };
-class RowWiseAddOpMaker : public framework::OpProtoAndCheckerMaker {
+class RowwiseAddOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  RowWiseAddOpMaker(framework::OpProto *proto,
+  RowwiseAddOpMaker(framework::OpProto *proto,
                    framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The left input of row-wise add op, must be matrix");
@@ -49,12 +51,32 @@ for i in xrange(X.shape[0]):
 )DOC");
  }
 };
+class RowwiseAddGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "X should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("b"), "b should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
+                            "Input(Out@GRAD) should not be null");
+    auto dims0 = ctx.Input<Tensor>("X")->dims();
+    auto dims1 = ctx.Input<Tensor>("b")->dims();
+    PADDLE_ENFORCE_EQ(1, dims1.size(), "b dims should be 1")
+    ctx.Output<Tensor>(framework::GradVarName("X"))->Resize(dims0);
+    ctx.Output<Tensor>(framework::GradVarName("b"))->Resize(dims1);
+  }
+};
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(rowwise_add, ops::RowWiseAddOp,
+REGISTER_OP(rowwise_add, ops::RowwiseAddOp, ops::RowwiseAddOpMaker,
-                             ops::RowWiseAddOpMaker);
+            rowwise_add_grad, ops::RowwiseAddGradOp);
+REGISTER_OP_CPU_KERNEL(
+    rowwise_add, ops::RowwiseAddKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    rowwise_add, ops::RowWiseAddKernel<paddle::platform::CPUPlace, float>);
+    rowwise_add_grad,
+    ops::RowwiseAddGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
@@ -17,4 +17,7 @@
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
-    rowwise_add, ops::RowWiseAddKernel<paddle::platform::GPUPlace, float>);
+    rowwise_add, ops::RowwiseAddKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    rowwise_add_grad,
+    ops::RowwiseAddGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
+Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
+you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
+Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
+distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
+See the License for the specific language governing permissions and
-   limitations under the License. */
+limitations under the License. */
 #pragma once
 #include "paddle/framework/eigen.h"
@@ -28,7 +28,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 template <typename Place, typename T>
-class RowWiseAddKernel : public framework::OpKernel {
+class RowwiseAddKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto out = context.Output<Tensor>("Out");
@@ -47,5 +47,25 @@ class RowWiseAddKernel : public framework::OpKernel {
  }
 };
+template <typename Place, typename T>
+class RowwiseAddGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* dOut = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dX = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* db = context.Output<Tensor>(framework::GradVarName("b"));
+    dX->mutable_data<T>(context.GetPlace());
+    db->mutable_data<T>(context.GetPlace());
+    auto OutGrad = EigenMatrix<T>::From(*dOut);
+    auto place = context.GetEigenDevice<Place>();
+    EigenMatrix<T>::From(*dX).device(place) = OutGrad;
+    // https://eigen.tuxfamily.org/dox/unsupported/TensorBase_8h_source.html
+    // colwise add
+    Eigen::array<int, 1> dims{{0}}; /* dimension to reduce */
+    EigenVector<T>::Flatten(*db).device(place) = OutGrad.sum(dims);
+  }
+};
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/operators/scale_op.h"
+#include "paddle/operators/net_op.h"
+namespace paddle {
+namespace operators {
+class ScaleOp : public framework::OperatorWithKernel {
+ public:
+  ScaleOp(const std::string &type, const framework::VariableNameMap &inputs,
+          const framework::VariableNameMap &outputs,
+          const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto *in = ctx.Input<framework::Tensor>("X");
+    auto *out = ctx.Output<framework::Tensor>("Out");
+    out->Resize(in->dims());
+  }
+};
+template <typename AttrType>
+class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input tensor of scale operator.").NotInGradient();
+    AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
+    AddComment(R"DOC(Scale operator
+The equation is: Out = scale*X
+)DOC");
+    AddAttr<AttrType>("scale", "scale of scale operator.").SetDefault(1.0);
+  }
+};
+// Identity Op's gradient is identity op, too.
+// Grad(Out=scale(X)) => Grad(X) = scale(Grad(Out))
+template <typename AttrType>
+class ScaleGradOp : public NetOp {
+ public:
+  ScaleGradOp(const std::string &type, const framework::VariableNameMap &inputs,
+              const framework::VariableNameMap &outputs,
+              const framework::AttributeMap &attrs)
+      : NetOp(type, inputs, outputs, attrs) {
+    AppendOp(framework::OpRegistry::CreateOp(
+        "scale", {{"X", {Input(framework::GradVarName("Out"))}}},
+        {{"Out", {Output(framework::GradVarName("X"))}}},
+        {{"scale", GetAttr<AttrType>("scale")}}));
+    CompleteAddOp(false);
+  }
+};
+// identity is a alias of scale op. This is also a example for creating a alias
+// operator.
+template <typename AttrType>
+class IdentityOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  IdentityOpMaker(framework::OpProto *proto,
+                  framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "input tensor of identity op");
+    AddOutput("Out", "output tensor of identity op");
+    AddComment("identity operator. Just a alias of scale op which scale = 1.0");
+  }
+};
+template <typename AttrType>
+class IdentityOp : public NetOp {
+ public:
+  IdentityOp(const std::string &type, const framework::VariableNameMap &inputs,
+             const framework::VariableNameMap &outputs,
+             const framework::AttributeMap &attrs)
+      : NetOp(type, inputs, outputs, attrs) {
+    AppendOp(framework::OpRegistry::CreateOp(
+        "scale", {{"X", {Input("X")}}}, {{"Out", {Output("Out")}}},
+        {{"scale", static_cast<AttrType>(1)}}));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP(scale, ops::ScaleOp, ops::ScaleOpMaker<float>, scale_grad,
+            ops::ScaleGradOp<float>);
+REGISTER_OP_CPU_KERNEL(scale,
+                       ops::ScaleKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_WITHOUT_GRADIENT(identity, ops::IdentityOp<float>,
+                             ops::IdentityOpMaker<float>);
--- a/paddle/operators/scale_op.cu
+++ b/paddle/operators/scale_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/operators/scale_op.h"
+REGISTER_OP_GPU_KERNEL(
+    scale, paddle::operators::ScaleKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/scale_op.h
+++ b/paddle/operators/scale_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+template <typename Place, typename T, typename AttrType = T>
+class ScaleKernel : public framework::OpKernel {
+ public:
+  virtual void Compute(const framework::ExecutionContext& context) const {
+    auto* tensor = context.Output<framework::Tensor>("Out");
+    auto* in = context.Input<framework::Tensor>("X");
+    tensor->mutable_data<T>(in->place());
+    auto scale = static_cast<T>(context.op_.GetAttr<AttrType>("scale"));
+    auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
+    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
+    auto& dev = context.GetEigenDevice<Place>();
+    eigen_out.device(dev) = scale * eigen_in;
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
   http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -39,7 +36,8 @@ class CPUUniformRandomKernel : public framework::OpKernel {
    std::uniform_real_distribution<T> dist(
        static_cast<T>(context.op_.GetAttr<float>("min")),
        static_cast<T>(context.op_.GetAttr<float>("max")));
-    for (ssize_t i = 0; i < framework::product(tensor->dims()); ++i) {
+    ssize_t size = framework::product(tensor->dims());
+    for (ssize_t i = 0; i < size; ++i) {
      data[i] = dist(engine);
    }
  }
@@ -66,7 +64,6 @@ class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddOutput("Out", "The output tensor of uniform random op");
    AddComment(R"DOC(Uniform random operator.
 Used to initialize tensor with uniform random generator.
 )DOC");
    AddAttr<std::vector<int>>("dims", "the dimension of random tensor");

--- a/paddle/operators/uniform_random_op.cu
+++ b/paddle/operators/uniform_random_op.cu
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
   http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -276,17 +276,21 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src,
 void Argument::concat(const std::vector<Argument>& args,
                      const std::vector<int>& selectRows,
                      const std::vector<int>& seqStartPos,
+                      const std::vector<int>& copySize,
                      bool useGpu,
                      hl_stream_t stream,
                      PassType passType) {
  CHECK(!subSequenceStartPositions)
      << "undefined behavior for subsequence positions";
-  size_t batchSize = selectRows.size();
+  size_t batchSize = 0;
+  for (size_t i = 0; i < copySize.size(); ++i)
+    batchSize += copySize[i] * (seqStartPos[i + 1] - seqStartPos[i]);
  auto copyArg = [batchSize, stream](MatrixPtr& dst,
                                     MatrixPtr src,
-                                     int startRow,
+                                     int desStartRow,
-                                     int pos,
+                                     int srcStartRow,
                                     int size,
                                     bool useGpu) {
    if (!src) {
@@ -300,14 +304,14 @@ void Argument::concat(const std::vector<Argument>& args,
      dst->resize(batchSize, width);
    }
-    MatrixPtr tmpMatrix = dst->subMatrix(startRow, size);
+    MatrixPtr tmpMatrix = dst->subMatrix(desStartRow, size);
-    tmpMatrix->copyFrom(*src->subMatrix(pos, size), stream);
+    tmpMatrix->copyFrom(*src->subMatrix(srcStartRow, size), stream);
  };
  auto copyIds = [batchSize, stream](IVectorPtr& dst,
                                     const IVectorPtr& src,
-                                     int startRow,
+                                     int desStartRow,
-                                     int pos,
+                                     int srcStartRow,
                                     int size,
                                     bool useGpu) {
    if (!src) {
@@ -315,13 +319,14 @@ void Argument::concat(const std::vector<Argument>& args,
      return;
    }
    IVector::resizeOrCreate(dst, batchSize, useGpu);
-    dst->subVec(startRow, size)->copyFrom(*src->subVec(pos, size), stream);
+    dst->subVec(desStartRow, size)
+        ->copyFrom(*src->subVec(srcStartRow, size), stream);
  };
  auto copyStrs = [batchSize, stream](SVectorPtr& dst,
                                      const SVectorPtr& src,
-                                      int startRow,
+                                      int desStartRow,
-                                      int pos,
+                                      int srcStartRow,
                                      int size,
                                      bool useGpu) {
    if (!src) {
@@ -333,30 +338,31 @@ void Argument::concat(const std::vector<Argument>& args,
    } else {
      dst->resize(batchSize);
    }
-    std::copy(
+    std::copy(src->begin() + srcStartRow,
-        src->begin() + pos, src->begin() + pos + size, dst->begin() + startRow);
+              src->begin() + srcStartRow + size,
+              dst->begin() + desStartRow);
  };
  dataId = args[0].dataId;
  CHECK_NE(seqStartPos.size(), 0UL);
-  size_t sampleNum = seqStartPos.size() - 1;
+  int desStartRow = 0;
-  for (size_t i = 0; i < sampleNum; ++i) {
+  for (size_t i = 0; i < copySize.size(); ++i) {
    int startPos = seqStartPos[i];
    int endPos = seqStartPos[i + 1];
    CHECK_GE(args.size(), static_cast<size_t>(endPos - startPos));
    for (int j = startPos; j < endPos; ++j) {
      const Argument& arg = args[j - startPos];
-      CHECK_EQ(arg.dataId, dataId) << "Arguments in concat should have"
+      CHECK_EQ(arg.dataId, dataId) << "Arguments to concatenate should have "
-                                   << " same dataId";
+                                   << "the same dataId.";
-      const int copySize = 1;
+      const int srcStartRow = selectRows[j];
-      const int rowIdx = selectRows[j];
+      copyArg(in, arg.in, desStartRow, srcStartRow, copySize[i], useGpu);
-      copyArg(in, arg.in, j, rowIdx, copySize, useGpu);
+      copyArg(value, arg.value, desStartRow, srcStartRow, copySize[i], useGpu);
-      copyArg(value, arg.value, j, rowIdx, copySize, useGpu);
      if (passType != PASS_TEST) {
-        copyArg(grad, arg.grad, j, rowIdx, copySize, useGpu);
+        copyArg(grad, arg.grad, desStartRow, srcStartRow, copySize[i], useGpu);
      }
-      copyIds(ids, arg.ids, j, rowIdx, copySize, useGpu);
+      copyIds(ids, arg.ids, desStartRow, srcStartRow, copySize[i], useGpu);
-      copyStrs(strs, arg.strs, j, rowIdx, copySize, useGpu);
+      copyStrs(strs, arg.strs, desStartRow, srcStartRow, copySize[i], useGpu);
+      desStartRow += copySize[i];
    }
  }
  ICpuGpuVector::resizeOrCreate(
@@ -670,19 +676,28 @@ void Argument::reorganizeSeqInfo(
    const ICpuGpuVectorPtr seqStartPos,
    const ICpuGpuVectorPtr subSeqStartPos,
    std::vector<std::vector<int>>& reorganizedSeqInfo) {
-  int* seqStarts = seqStartPos->getMutableData(false);
+  CHECK(seqStartPos);
-  int* subSeqStarts = subSeqStartPos->getMutableData(false);
  int seqNum = seqStartPos->getSize() - 1;
-  reorganizedSeqInfo.resize(seqNum, std::vector<int>());
+  int* seqStarts = seqStartPos->getMutableData(false);
-  int seqIdx = 0;
-  for (size_t i = 0; i < subSeqStartPos->getSize(); ++i) {
+  if (subSeqStartPos) {
-    reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
+    int* subSeqStarts = subSeqStartPos->getMutableData(false);
-    if (subSeqStarts[i] == seqStarts[seqIdx + 1]) {
+    reorganizedSeqInfo.resize(seqNum, std::vector<int>());
-      seqIdx++;
+    int seqIdx = 0;
-      if (seqIdx == seqNum) return;
+    for (size_t i = 0; i < subSeqStartPos->getSize(); ++i) {
      reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
+      if (subSeqStarts[i] == seqStarts[seqIdx + 1]) {
+        seqIdx++;
+        if (seqIdx == seqNum) return;
+        reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
+      }
    }
+  } else {
+    reorganizedSeqInfo.resize(1, std::vector<int>(seqNum + 1, 0));
+    memcpy(reorganizedSeqInfo[0].data(),
+           seqStarts,
+           sizeof(int) * seqStartPos->getSize());
  }
 }

--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -240,6 +240,7 @@ struct Argument {
  void concat(const std::vector<Argument>& args,
              const std::vector<int>& selectRows,
              const std::vector<int>& seqStartPos,
+              const std::vector<int>& copySize,
              bool useGpu,
              hl_stream_t stream,
              PassType passType);

--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
@@ -65,7 +65,10 @@ public:
  size_t getSize() const { return config_.size(); }
  bool isFullSize() const {
-    return this->getSize() == bufs_[PARAMETER_VALUE]->getSize();
+    if (bufs_[PARAMETER_VALUE]) {
+      return this->getSize() == bufs_[PARAMETER_VALUE]->getSize();
+    }
+    return false;
  }
  inline bool useGpu() const { return useGpu_; }

--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
 cc_library(cpu_info SRCS cpu_info.cc DEPS gflags glog)
 cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
-nv_library(gpu_info SRCS gpu_info.cc DEPS gflags)
+nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog)
 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
@@ -9,6 +9,7 @@ cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 add_subdirectory(dynload)
 cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece)
+cc_test(environment_test SRCS environment_test.cc DEPS stringpiece)
 IF(WITH_GPU)
    set(GPU_CTX_DEPS dynload_cuda dynamic_loader)

--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -114,9 +114,6 @@ CUDADeviceContext::~CUDADeviceContext() {
    PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
  }
-  if (curand_generator_) {
-    PADDLE_ENFORCE(dynload::curandDestroyGenerator(curand_generator_));
-  }
  eigen_stream_.reset();
  eigen_device_.reset();
  PADDLE_ENFORCE(cudaStreamDestroy(stream_));
@@ -152,19 +149,6 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() {
 cudaStream_t CUDADeviceContext::stream() { return stream_; }
-curandGenerator_t CUDADeviceContext::curand_generator() {
-  if (!curand_generator_) {
-    SetDeviceId(place_.device);
-    PADDLE_ENFORCE(dynload::curandCreateGenerator(&curand_generator_,
-                                                  CURAND_RNG_PSEUDO_DEFAULT));
-    PADDLE_ENFORCE(
-        dynload::curandSetPseudoRandomGeneratorSeed(curand_generator_, seed_));
-    PADDLE_ENFORCE(dynload::curandSetStream(curand_generator_, stream_));
-  }
-  return curand_generator_;
-}
 #endif  // PADDLE_ONLY_CPU
 }  // namespace platform

--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -17,7 +17,6 @@ limitations under the License. */
 #ifndef PADDLE_ONLY_CPU
 #include "paddle/platform/dynload/cublas.h"
 #include "paddle/platform/dynload/cudnn.h"
-#include "paddle/platform/dynload/curand.h"
 #include "paddle/platform/gpu_info.h"
 #define EIGEN_USE_GPU
 #endif
@@ -40,7 +39,7 @@ class DeviceContext {
 class CPUDeviceContext : public DeviceContext {
 public:
  CPUDeviceContext();
-  explicit CPUDeviceContext(CPUPlace);
+  explicit CPUDeviceContext(CPUPlace place);
  virtual ~CPUDeviceContext() {}
  Eigen::DefaultDevice* eigen_device() const;
@@ -56,7 +55,7 @@ class EigenCudaStreamDevice;
 class CUDADeviceContext : public DeviceContext {
 public:
-  explicit CUDADeviceContext(GPUPlace);
+  explicit CUDADeviceContext(GPUPlace place);
  virtual ~CUDADeviceContext();
  /*! \brief  Wait for all operations completion in the stream. */
@@ -75,9 +74,6 @@ class CUDADeviceContext : public DeviceContext {
  /*! \brief  Return cudnn  handle in the device context. */
  cudnnHandle_t     cudnn_handle();
-  /*! \brief  Return curand handle in the device context. */
-  curandGenerator_t curand_generator();
  /*! \brief  Return cuda stream in the device context. */
  cudaStream_t      stream();
  // clang-format on
@@ -85,18 +81,13 @@ class CUDADeviceContext : public DeviceContext {
 private:
  GPUPlace place_;
- private:
  std::unique_ptr<Eigen::GpuDevice> eigen_device_;
  std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
- private:
-  uint64_t seed_;
  // clang-format off
  cudaStream_t       stream_{nullptr};
  cudnnHandle_t      cudnn_handle_{nullptr};
  cublasHandle_t     cublas_handle_{nullptr};
-  curandGenerator_t  curand_generator_{nullptr};
  // clang-format on
 };

--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cc
@@ -43,8 +43,6 @@ TEST(Device, CUDADeviceContext) {
    ASSERT_NE(nullptr, cudnn_handle);
    cublasHandle_t cublas_handle = device_context->cublas_handle();
    ASSERT_NE(nullptr, cublas_handle);
-    curandGenerator_t curand_handle = device_context->curand_generator();
-    ASSERT_NE(nullptr, curand_handle);
    ASSERT_NE(nullptr, device_context->stream());
    delete device_context;
  }

--- a/paddle/platform/environment.h
+++ b/paddle/platform/environment.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <stdlib.h>
+#include <unistd.h>
+#include <vector>
+#include "paddle/platform/enforce.h"
+#include "paddle/string/piece.h"
+extern char** environ;  // for environment variables
+namespace paddle {
+namespace platform {
+inline void SetEnvVariable(const std::string& name, const std::string& value) {
+  PADDLE_ENFORCE_NE(setenv(name.c_str(), value.c_str(), 1), -1,
+                    "Failed to set environment variable %s=%s", name, value);
+}
+inline void UnsetEnvVariable(const std::string& name) {
+  PADDLE_ENFORCE_NE(unsetenv(name.c_str()), -1,
+                    "Failed to unset environment variable %s", name);
+}
+inline bool IsEnvVarDefined(const std::string& name) {
+  return std::getenv(name.c_str()) != nullptr;
+}
+inline std::string GetEnvValue(const std::string& name) {
+  PADDLE_ENFORCE(IsEnvVarDefined(name),
+                 "Tried to access undefined environment variable %s", name);
+  return std::getenv(name.c_str());
+}
+inline std::vector<std::string> GetAllEnvVariables() {
+  std::vector<std::string> vars;
+  for (auto var = environ; *var != nullptr; ++var) {
+    auto tail = string::Index(*var, "=");
+    auto name = string::SubStr(*var, 0, tail).ToString();
+    vars.push_back(name);
+  }
+  return vars;
+}
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/platform/environment_test.cc
+++ b/paddle/platform/environment_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/platform/environment.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+TEST(ENVIRONMENT, ACCESS) {
+  namespace platform = paddle::platform;
+  namespace string = paddle::string;
+  platform::SetEnvVariable("PADDLE_USE_ENV", "TRUE");
+  EXPECT_TRUE(platform::IsEnvVarDefined("PADDLE_USE_ENV"));
+  EXPECT_EQ(platform::GetEnvValue("PADDLE_USE_ENV"), "TRUE");
+  platform::UnsetEnvVariable("PADDLE_USE_ENV");
+  EXPECT_FALSE(platform::IsEnvVarDefined("PADDLE_USE_ENV"));
+  platform::SetEnvVariable("PADDLE_USE_ENV1", "Hello ");
+  platform::SetEnvVariable("PADDLE_USE_ENV2", "World, ");
+  platform::SetEnvVariable("PADDLE_USE_ENV3", "PaddlePaddle!");
+  std::string env_info;
+  auto vars = platform::GetAllEnvVariables();
+  for_each(vars.begin(), vars.end(), [&](const std::string& var) {
+    env_info += platform::GetEnvValue(var);
+  });
+  EXPECT_TRUE(string::Contains(env_info, "Hello World, PaddlePaddle!"));
+  platform::UnsetEnvVariable("PADDLE_USE_ENV1");
+  platform::UnsetEnvVariable("PADDLE_USE_ENV2");
+  platform::UnsetEnvVariable("PADDLE_USE_ENV3");
+  env_info.clear();
+  vars = platform::GetAllEnvVariables();
+  for_each(vars.begin(), vars.end(), [&](const std::string& var) {
+    env_info += platform::GetEnvValue(var);
+  });
+  EXPECT_FALSE(string::Contains(env_info, "Hello World, PaddlePaddle!"));
+  EXPECT_FALSE(platform::IsEnvVarDefined("PADDLE_USE_ENV1"));
+  EXPECT_FALSE(platform::IsEnvVarDefined("PADDLE_USE_ENV2"));
+  EXPECT_FALSE(platform::IsEnvVarDefined("PADDLE_USE_ENV3"));
+}
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/platform/gpu_info.h"
 #include "gflags/gflags.h"
 #include "paddle/platform/enforce.h"
+#include "paddle/platform/environment.h"
 DEFINE_double(fraction_of_gpu_memory_to_use, 0.95,
              "Default use 95% of GPU memory for PaddlePaddle,"
@@ -70,6 +73,13 @@ size_t GpuMaxChunkSize() {
  GpuMemoryUsage(available, total);
+  if (IsEnvVarDefined(kEnvFractionGpuMemoryToUse)) {
+    auto val = std::stod(GetEnvValue(kEnvFractionGpuMemoryToUse));
+    PADDLE_ENFORCE_GT(val, 0.0);
+    PADDLE_ENFORCE_LE(val, 1.0);
+    FLAGS_fraction_of_gpu_memory_to_use = val;
+  }
  // Reserving the rest memory for page tables, etc.
  size_t reserving = (1 - FLAGS_fraction_of_gpu_memory_to_use) * total;

--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -18,10 +18,15 @@ limitations under the License. */
 #include <cuda_runtime.h>
 #include <stddef.h>
+#include <string>
 namespace paddle {
 namespace platform {
+//! Environment variable: fraction of GPU memory to use on each device.
+const std::string kEnvFractionGpuMemoryToUse =
+    "PADDLE_FRACTION_GPU_MEMORY_TO_USE";
 //! Get the total number of GPU devices in system.
 int GetDeviceCount();

--- a/paddle/pserver/ParameterClient2.cpp
+++ b/paddle/pserver/ParameterClient2.cpp
@@ -65,7 +65,6 @@ void ParameterClient2::initThreads() {
    LOG(INFO) << "parallel_thread_num dosent need to set";
  }
  syncThreadPool_.reset(new SyncThreadPool(threadNum_));
  startThreads();
 }
@@ -224,6 +223,14 @@ void ParameterClient2::prepareSendData(
    request.set_cost(cost);
    request.set_batch_status(batchStatus);
    CHECK_EQ(request.blocks_size(), 0);
+    VLOG(10) << "request: trainer_id: " << request.trainer_id()
+             << " update_mode" << request.update_mode()
+             << " send_back_parameter: " << request.send_back_parameter()
+             << " send_back_parameter_type: "
+             << request.send_back_parameter_type()
+             << " num_samples: " << request.num_samples()
+             << " cost: " << request.cost()
+             << " batch_status: " << request.batch_status();
  }
  for (const auto& segments : parameterSegments) {
    const auto it = parameterMap_.find(segments.id);
@@ -251,11 +258,17 @@ void ParameterClient2::prepareSendData(
      CHECK(sendMat != nullptr) << "sendMat is nullptr";
      syncThreadPool_->exec([&](int tid, size_t numThreads) {
+        std::lock_guard<std::mutex> guard(sparseAutoGrowthMutex_);
        const auto& localIndices = prefetchMat->getLocalIndices();
        /// num of sparse rows
        size_t nLocalBlocks = localIndices.size();
        uint64_t beginDim = 0;
        uint64_t endDim = 0;
+        // FIXME(typhoonzero): let it resize first
+        prefetchMat->getLocalRow(nLocalBlocks + 1);
+        sendMat->getLocalRow(nLocalBlocks + 1);
        for (size_t row = 0; row < nLocalBlocks; ++row) {
          int64_t blockId = localIndices[row];  // local row -> sparse row
          int serverId = std::abs((blockId + nameHash) % serviceNum_);
@@ -275,7 +288,6 @@ void ParameterClient2::prepareSendData(
          block->set_begin_pos(row * blockSize);
          /// block len
          block->set_block_size(endDim - beginDim);
          if (sendingPara) {
            sendJob->parallelInputIovs[serverId].push_back(
                {sendMat->getLocalRow(row), sizeof(real) * (size_t)blockSize});

--- a/paddle/pserver/ParameterClient2.h
+++ b/paddle/pserver/ParameterClient2.h
@@ -583,6 +583,7 @@ protected:
 #ifndef PADDLE_DISABLE_TIMER
  uint64_t forwardbackwordTime_;
 #endif
+  std::mutex sparseAutoGrowthMutex_;
  /// map id to parameter used for decoding protobuf data
  std::unordered_map<size_t, ParameterPtr> parameterMap_;

--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
+if(WITH_PYTHON)
+cc_library(paddle_pybind SHARED
+    SRCS pybind.cc
+    DEPS pybind python backward
+    sgd_op
+    gather_op
+    add_op
+    mul_op
+    rowwise_add_op
+    sigmoid_op
+    softmax_op
+    mean_op
+    cross_entropy_op
+    recurrent_op
+    uniform_random_op
+    gaussian_random_op
+    fill_zeros_like_op
+    scale_op
+    minus_op)
+endif(WITH_PYTHON)
--- a/paddle/framework/pybind.cc
+++ b/paddle/framework/pybind.cc
@@ -18,11 +18,11 @@ limitations under the License. */
 #include "paddle/framework/backward.h"
 #include "paddle/framework/op_registry.h"
-#include "paddle/framework/tensor_py.h"
 #include "paddle/operators/net_op.h"
 #include "paddle/operators/recurrent_op.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
+#include "paddle/pybind/tensor_py.h"
 #include "paddle/string/to_string.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
@@ -31,7 +31,7 @@ limitations under the License. */
 namespace py = pybind11;
 USE_OP(add_two);
-USE_CPU_ONLY_OP(onehot_cross_entropy);
+USE_OP(onehot_cross_entropy);
 USE_OP(sgd);
 USE_OP(mul);
 USE_OP(mean);
@@ -42,6 +42,10 @@ USE_OP(fill_zeros_like);
 USE_OP_ITSELF(recurrent_op);
 USE_OP(gaussian_random);
 USE_OP(uniform_random);
+USE_OP(scale);
+USE_OP_ITSELF(identity);
+USE_OP(minus);
+USE_CPU_ONLY_OP(gather);
 namespace paddle {
 namespace framework {
@@ -131,26 +135,24 @@ All parameter, weight, gradient are variables in Paddle.
           py::return_value_policy::reference)
      .def("find_var", &Scope::FindVar, py::return_value_policy::reference)
      .def(py::init<>())
-      .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); },
+      .def("new_scope",
+           [](Scope &self) -> Scope * { return &self.NewScope(); },
           py::return_value_policy::reference)
      .def("drop_kids", &Scope::DropKids);
  //! @note: Be careful! PyBind will return std::string as an unicode, not
  //! Python str. If you want a str object, you should cast them in Python.
  m.def("get_all_op_protos", []() -> std::vector<py::bytes> {
-    auto &op_info_map = OpRegistry::op_info_map();
    std::vector<py::bytes> ret_values;
-    for (auto it = op_info_map.begin(); it != op_info_map.end(); ++it) {
-      const OpProto *proto = it->second.proto_;
+    OpInfoMap::Instance().IterAllInfo([&ret_values](const std::string &type,
-      if (proto == nullptr) {
+                                                    const OpInfo &info) {
-        continue;
+      if (!info.HasOpProtoAndChecker()) return;
-      }
-      PADDLE_ENFORCE(proto->IsInitialized(), "OpProto must all be initialized");
      std::string str;
-      PADDLE_ENFORCE(proto->SerializeToString(&str),
+      PADDLE_ENFORCE(info.Proto().SerializeToString(&str),
                     "Serialize OpProto Error. This could be a bug of Paddle.");
-      ret_values.push_back(py::bytes(str));
+      ret_values.emplace_back(str);
-    }
+    });
    return ret_values;
  });
  m.def_submodule(
@@ -222,8 +224,10 @@ All parameter, weight, gradient are variables in Paddle.
                    retv->SetType("plain_net");
                    return retv;
                  })
-      .def("add_op", [](operators::NetOp &self,
+      .def("append_op",
-                        const OperatorBase &op) { self.AddOp(op); })
+           [](operators::NetOp &self, const OperatorBase &op) {
+             self.AppendOp(op);
+           })
      .def("complete_add_op", &operators::NetOp::CompleteAddOp)
      .def("complete_add_op", [](std::shared_ptr<operators::NetOp> &self) {
        self->CompleteAddOp();
@@ -243,10 +247,9 @@ All parameter, weight, gradient are variables in Paddle.
            auto rnn_op = OpRegistry::CreateOp(desc);
            return static_cast<operators::RecurrentOp *>(rnn_op.release());
          })
-      .def("set_stepnet", [](operators::RecurrentOp &self,
+      .def("set_stepnet",
-                             const operators::NetOp &net) -> void {
+           [](operators::RecurrentOp &self, const operators::NetOp &net)
-        self.set_stepnet(net.Clone());
+               -> void { self.set_stepnet(net.Clone()); });
-      });
  m.def("unique_integer", UniqueIntegerGenerator);

--- a/paddle/framework/tensor_py.h
+++ b/paddle/framework/tensor_py.h
@@ -63,8 +63,11 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
      }
      return py::buffer_info(
          dst_tensor.mutable_data<CUR_TYPE>(dst_tensor.holder_->place()),
-          sizeof(CUR_TYPE), py::format_descriptor<CUR_TYPE>::format(),
+          sizeof(CUR_TYPE),
-          (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
+          py::format_descriptor<CUR_TYPE>::format(),
+          (size_t)framework::arity(dst_tensor.dims()),
+          dims_outside,
+          strides);
    } else {
      constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
      return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
@@ -107,8 +110,8 @@ void PyCUDATensorSetFromArray(
  self.Resize(framework::make_ddim(dims));
  auto *dst = self.mutable_data<T>(place);
-  paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
+  paddle::platform::GpuMemcpySync(
-                                  cudaMemcpyHostToDevice);
+      dst, array.data(), sizeof(T) * array.size(), cudaMemcpyHostToDevice);
 }
 #endif

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -338,7 +338,8 @@ def RecurrentLayerGroupWithoutOutLinksBegin(name,
        in_links_count += 1
        layer_name = MakeLayerNameInParentSubmodel(name)
        layer = g_layer_map[layer_name]
-        ScatterAgentLayer(name=name, size=layer.size)
+        ScatterAgentLayer(
+            name=name, size=layer.size, width=layer.width, height=layer.height)
        pair = g_current_submodel.in_links.add()
        pair.layer_name = layer_name
@@ -2197,8 +2198,8 @@ class MaxOutLayer(LayerBase):
        maxout_conf = self.config.inputs[0].maxout_conf
        parse_maxout(self.inputs[0].maxout, input_layer.name, maxout_conf)
        out_channels = maxout_conf.image_conf.channels / maxout_conf.groups
-        self.set_cnn_layer(name, g_layer_map[input_layer.name].height,
+        self.set_cnn_layer(name, maxout_conf.image_conf.img_size_y,
-                           g_layer_map[input_layer.name].width, out_channels)
+                           maxout_conf.image_conf.img_size, out_channels)
 @config_layer('row_conv')
@@ -2232,6 +2233,20 @@ class ClipLayer(LayerBase):
        self.config.inputs[0].clip_conf.max = max
+@config_layer('scale_shift')
+class ScaleShiftLayer(LayerBase):
+    def __init__(self, name, inputs, bias=True, **xargs):
+        super(ScaleShiftLayer, self).__init__(
+            name, 'scale_shift', 0, inputs=inputs, **xargs)
+        config_assert(
+            len(self.inputs) == 1,
+            'ScaleShiftLayer must have one and only one input.')
+        input_layer = self.get_input_layer(0)
+        self.set_layer_size(input_layer.size)
+        self.create_input_parameter(0, 1, [1, 1])
+        self.create_bias_parameter(bias, 1)
 # key: cost type
 # value: cost class
 g_cost_map = {}
@@ -2402,9 +2417,11 @@ class GatherAgentLayer(LayerBase):
 @config_layer('scatter_agent')
 class ScatterAgentLayer(LayerBase):
-    def __init__(self, name, size, device=None):
+    def __init__(self, name, size, width=None, height=None, device=None):
        super(ScatterAgentLayer, self).__init__(
            name, 'scatter_agent', size, inputs=[], device=device)
+        if height and width:
+            self.set_layer_height_width(height, width)
 @config_layer('multiplex')
@@ -2688,6 +2705,49 @@ class SubSequenceLayer(LayerBase):
        self.create_bias_parameter(bias, size)
+@config_layer('seq_slice')
+class SeqSliceLayer(LayerBase):
+    def __init__(self, name, inputs, starts, ends, bias=False, **xargs):
+        if isinstance(inputs, list):
+            assert len(inputs) == 1, ('the first input of sequence slice layer '
+                                      'is a single sequence input.')
+        else:
+            inputs = [inputs]
+        if starts is not None:
+            if isinstance(starts, list):
+                assert len(starts) == 1, (
+                    'the start indices for sequence slice layer cannot '
+                    'be a list having more than one element.')
+                starts = starts[0]
+            inputs.append(starts)
+        if ends is not None:
+            if isinstance(ends, list):
+                assert len(ends) == 1, (
+                    'the end indices for sequence slice layer cannot '
+                    'be a list having more than one element.')
+                ends = ends[0]
+            inputs.append(ends)
+        assert len(inputs) >= 2, (
+            'the sequence slice layer has at least two inputs.')
+        super(SeqSliceLayer, self).__init__(
+            name, 'seq_slice', 0, inputs=inputs, **xargs)
+        input_layer0 = self.get_input_layer(0)
+        size = input_layer0.size
+        self.set_layer_size(size)
+        if len(inputs) == 3:
+            assert (
+                self.get_input_layer(1).size == self.get_input_layer(2).size), (
+                    'If start and end indices are both given to'
+                    'sequence slice layer, they should have the same width.')
+        elif len(inputs) == 2:
+            self.config.select_first = (starts is not None)
 @config_layer('sub_nested_seq')
 class SubNestedSequenceLayer(LayerBase):
    def __init__(self, name, inputs, selected_indices, bias=False, **xargs):

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -16,11 +16,13 @@ import functools
 import collections
 import inspect
+import paddle.trainer.config_parser as cp
 from paddle.trainer.config_parser import *
 from .activations import LinearActivation, SigmoidActivation, TanhActivation, \
    ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation
 from .evaluators import *
-from .poolings import MaxPooling, AvgPooling, BasePoolingType
+from .poolings import MaxPooling, AvgPooling, BasePoolingType, \
+    CudnnAvgPooling, CudnnMaxPooling
 from .attrs import *
 from .default_decorators import *
@@ -133,7 +135,9 @@ __all__ = [
    'sub_nested_seq_layer',
    'clip_layer',
    'slice_projection',
+    'seq_slice_layer',
    'kmax_sequence_score_layer',
+    'scale_shift_layer',
 ]
@@ -230,8 +234,10 @@ class LayerType(object):
    CROP_LAYER = 'crop'
    SUB_NESTED_SEQ = 'sub_nested_seq'
    CLIP_LAYER = 'clip'
+    SEQ_SLICE = 'seq_slice'
    KMAX_SEQ_SCORE = 'kmax_seq_score'
+    SCALE_SHIFT_LAYER = 'scale_shift'
    @staticmethod
    def is_layer_type(type_name):
@@ -330,6 +336,14 @@ class LayerOutput(object):
        self.outputs = outputs
        self.reverse = reverse
+    @property
+    def width(self):
+        return cp.g_layer_map[self.full_name].width
+    @property
+    def height(self):
+        return cp.g_layer_map[self.full_name].height
    def set_input(self, input):
        """
        Set the input for a memory layer. Can only be used for memory layer
@@ -911,7 +925,13 @@ def data_layer(name, size, height=None, width=None, layer_attr=None):
        width=width,
        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(name, LayerType.DATA, size=size)
+    num_filters = None
+    if height is not None and width is not None:
+        num_filters = size / (width * height)
+        assert num_filters * width * height == size, \
+            "size=%s width=%s height=%s" % (size, width, height)
+    return LayerOutput(name, LayerType.DATA, size=size, num_filters=num_filters)
 @wrap_name_default("embedding")
@@ -2571,6 +2591,10 @@ def img_pool_layer(input,
        assert input.num_filters is not None
        num_channels = input.num_filters
+    assert type(pool_type) in [AvgPooling, MaxPooling, CudnnAvgPooling,
+                               CudnnMaxPooling], \
+        "only (Cudnn)AvgPooling, (Cudnn)MaxPooling are supported"
    if pool_type is None:
        pool_type = MaxPooling()
    elif isinstance(pool_type, AvgPooling):
@@ -2580,7 +2604,6 @@ def img_pool_layer(input,
        if (
        isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
        else pool_type.name
    pool_size_y = pool_size if pool_size_y is None else pool_size_y
    stride_y = stride if stride_y is None else stride_y
    padding_y = padding if padding_y is None else padding_y
@@ -4204,8 +4227,7 @@ def conv_operator(img,
        num_channels = img.num_filters
    assert isinstance(filter, LayerOutput)
-    if filter.size is not None:
+    assert filter.size is not None
-        filter.size = filter_size * filter_size_y * num_filters * num_channels
    opCls = ConvTransOperator if trans else ConvOperator
@@ -4916,7 +4938,6 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
-    assert input.layer_type == LayerType.CONV_LAYER
    assert isinstance(input.activation, LinearActivation)
    assert groups > 1
    if num_channels is None:
@@ -6238,6 +6259,72 @@ def clip_layer(input, min, max, name=None):
        name, LayerType.CLIP_LAYER, parents=[input], size=input.size)
+@wrap_name_default()
+def seq_slice_layer(input, starts, ends, name=None):
+    """
+    seq_slice_layer will return one or several sub-sequences from the
+    input sequence layer given start and end indices.
+        - If only start indices are given, and end indices are set to None,
+          this layer slices the input sequence from the given start indices
+          to its end.
+        - If only end indices are given, and start indices are set to None,
+          this layer slices the input sequence from its beginning to the
+          given end indices.
+        - If start and end indices are both given, they should have the same
+          number of elements.
+    If start or end indices contains more than one elements, the input sequence
+    will be sliced for multiple times.
+    .. code-block:: python
+        seq_silce = seq_slice_layer(input=input_seq,
+                                    starts=start_pos, ends=end_pos)
+    :param name: name of this layer.
+    :type name: basestring
+    :param input: input for this layer, it should be a sequence.
+    :type input: LayerOutput
+    :param starts: start indices to slice the input sequence.
+    :type starts: LayerOutput|None
+    :param ends: end indices to slice the input sequence.
+    :type ends: LayerOutput|None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput), (
+        'The first input of seq_slice layer must be a PaddlePaddle layer.')
+    if starts is not None:
+        assert isinstance(starts, LayerOutput), (
+            'The start indices for seq_slice layer '
+            'must be a PaddlePaddle layer.')
+    if ends is not None:
+        assert isinstance(ends, LayerOutput), (
+            'The end indices for seq_slice layer must be a PaddlePaddle layer.')
+    assert starts is not None or ends is not None, (
+        'start and end indices '
+        'cannot be set to None at the same time, at least one of '
+        'them should be given.')
+    if starts is not None and ends is not None:
+        assert starts.size == ends.size, (
+            'If start and end indices are both given to seq_slice_layer, '
+            'they should have the same width.')
+    Layer(
+        name=name,
+        type=LayerType.SEQ_SLICE,
+        inputs=input.name,
+        starts=starts.name if starts is not None else None,
+        ends=ends.name if ends is not None else None)
+    return LayerOutput(
+        name, LayerType.SEQ_SLICE, parents=[input], size=input.size)
 @wrap_name_default()
 @layer_support()
 def kmax_sequence_score_layer(input, name=None, beam_size=1):
@@ -6274,3 +6361,43 @@ def kmax_sequence_score_layer(input, name=None, beam_size=1):
    return LayerOutput(
        name, LayerType.KMAX_SEQ_SCORE, parents=[input], size=input.size)
+@wrap_name_default("scale_shift")
+@wrap_param_attr_default()
+@wrap_bias_attr_default()
+def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None):
+    """
+    A layer applies a linear transformation to each element in each row of
+    the input matrix. For each element, the layer first re-scale it and then
+    adds a bias to it.
+    This layer is very like the SlopeInterceptLayer, except the scale and
+    bias are trainable.
+    .. math::
+        y = w * x + b
+    .. code-block:: python
+        scale_shift = scale_shift_layer(input=input_layer, bias_attr=False)
+    :param name: The Layer Name.
+    :type name: basestring
+    :param input: The input layer.
+    :type input: LayerOutput.
+    :param param_attr: The parameter attribute of scaling.
+    :type param_attr: ParameterAttribute
+    :param bias_attr: The parameter attribute of shifting.
+    :type bias_attr: ParameterAttribute
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    Layer(
+        name=name,
+        type=LayerType.SCALE_SHIFT_LAYER,
+        inputs=Input(input.name, **param_attr.attr),
+        bias=ParamAttr.to_bias(bias_attr))
+    return LayerOutput(
+        name, LayerType.SCALE_SHIFT_LAYER, parents=[input], size=input.size)
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -8,6 +8,7 @@ test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
 test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
 test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer
 test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
-test_kmax_seq_socre_layer test_seq_select_layers)
+test_kmax_seq_socre_layer test_seq_select_layers test_scale_shift_layer
+test_seq_slice_layer)
 export whole_configs=(test_split_datasource)
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
 type: "nn"
 layers {
-  name: "input"
+  name: "input_seq"
-  type: "data"
-  size: 300
-  active_type: ""
-}
-layers {
-  name: "data"
  type: "data"
  size: 128
  active_type: ""
@@ -17,7 +11,7 @@ layers {
  size: 1
  active_type: "exponential"
  inputs {
-    input_layer_name: "data"
+    input_layer_name: "input_seq"
    input_parameter_name: "___fc_layer_0__.w0"
  }
  bias_parameter_name: "___fc_layer_0__.wbias"
@@ -51,15 +45,14 @@ parameters {
  initial_strategy: 0
  initial_smart: false
 }
-input_layer_names: "data"
+input_layer_names: "input_seq"
 output_layer_names: "__kmax_sequence_score_layer_0__"
 sub_models {
  name: "root"
-  layer_names: "input"
+  layer_names: "input_seq"
-  layer_names: "data"
  layer_names: "__fc_layer_0__"
  layer_names: "__kmax_sequence_score_layer_0__"
-  input_layer_names: "data"
+  input_layer_names: "input_seq"
  output_layer_names: "__kmax_sequence_score_layer_0__"
  is_recurrent_layer_group: false
 }

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_shift_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_shift_layer.protostr
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 100
+  active_type: ""
+}
+layers {
+  name: "__scale_shift_0__"
+  type: "scale_shift"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___scale_shift_0__.w0"
+  }
+}
+layers {
+  name: "__scale_shift_1__"
+  type: "scale_shift"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___scale_shift_1__.w0"
+  }
+  bias_parameter_name: "___scale_shift_1__.wbias"
+}
+parameters {
+  name: "___scale_shift_0__.w0"
+  size: 1
+  initial_mean: 0.0
+  initial_std: 1.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___scale_shift_1__.w0"
+  size: 1
+  initial_mean: 0.0
+  initial_std: 1.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___scale_shift_1__.wbias"
+  size: 1
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "data"
+output_layer_names: "__scale_shift_0__"
+output_layer_names: "__scale_shift_1__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "__scale_shift_0__"
+  layer_names: "__scale_shift_1__"
+  input_layer_names: "data"
+  output_layer_names: "__scale_shift_0__"
+  output_layer_names: "__scale_shift_1__"
+  is_recurrent_layer_group: false
+}
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_slice_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_slice_layer.protostr
+type: "nn"
+layers {
+  name: "word"
+  type: "data"
+  size: 128
+  active_type: ""
+}
+layers {
+  name: "starts"
+  type: "data"
+  size: 5
+  active_type: ""
+}
+layers {
+  name: "ends"
+  type: "data"
+  size: 5
+  active_type: ""
+}
+layers {
+  name: "__seq_slice_layer_0__"
+  type: "seq_slice"
+  size: 128
+  active_type: ""
+  inputs {
+    input_layer_name: "word"
+  }
+  inputs {
+    input_layer_name: "starts"
+  }
+  inputs {
+    input_layer_name: "ends"
+  }
+}
+layers {
+  name: "__seq_slice_layer_1__"
+  type: "seq_slice"
+  size: 128
+  active_type: ""
+  inputs {
+    input_layer_name: "word"
+  }
+  inputs {
+    input_layer_name: "starts"
+  }
+  select_first: true
+}
+layers {
+  name: "__seq_slice_layer_2__"
+  type: "seq_slice"
+  size: 128
+  active_type: ""
+  inputs {
+    input_layer_name: "word"
+  }
+  inputs {
+    input_layer_name: "ends"
+  }
+  select_first: false
+}
+input_layer_names: "word"
+output_layer_names: "__seq_slice_layer_0__"
+output_layer_names: "__seq_slice_layer_1__"
+output_layer_names: "__seq_slice_layer_2__"
+sub_models {
+  name: "root"
+  layer_names: "word"
+  layer_names: "starts"
+  layer_names: "ends"
+  layer_names: "__seq_slice_layer_0__"
+  layer_names: "__seq_slice_layer_1__"
+  layer_names: "__seq_slice_layer_2__"
+  input_layer_names: "word"
+  output_layer_names: "__seq_slice_layer_0__"
+  output_layer_names: "__seq_slice_layer_1__"
+  output_layer_names: "__seq_slice_layer_2__"
+  is_recurrent_layer_group: false
+}
--- a/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
@@ -2,9 +2,7 @@
 #coding=utf-8
 from paddle.trainer_config_helpers import *
-data = data_layer(name='input', size=300)
+data = data_layer(name="input_seq", size=128)
-data = data_layer(name="data", size=128)
 scores = fc_layer(input=data, size=1, act=ExpActivation())
 kmax_seq_id = kmax_sequence_score_layer(input=scores, beam_size=5)

--- a/python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py
+from paddle.trainer_config_helpers import *
+data = data_layer(name='data', size=100)
+scale = scale_shift_layer(input=data, bias_attr=False)
+scale_shift = scale_shift_layer(input=data)
+outputs(scale, scale_shift)
--- a/python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py
+#!/usr/bin/env python
+#coding=utf-8
+from paddle.trainer_config_helpers import *
+input_seq = data_layer("word", size=128)
+starts = data_layer("starts", size=5)
+ends = data_layer("ends", size=5)
+seq_slice1 = seq_slice_layer(input=input_seq, starts=starts, ends=ends)
+seq_slice2 = seq_slice_layer(input=input_seq, starts=starts, ends=None)
+seq_slice3 = seq_slice_layer(input=input_seq, starts=None, ends=ends)
+outputs(seq_slice1, seq_slice2, seq_slice3)
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -13,6 +13,7 @@ py_test(test_add_two_op SRCS test_add_two_op.py)
 py_test(test_sigmoid_op SRCS test_sigmoid_op.py)
 py_test(test_softmax_op SRCS test_softmax_op.py)
 py_test(test_cross_entropy_op SRCS test_cross_entropy_op.py)
+py_test(test_gather_op SRCS test_gather_op.py)
 py_test(test_fill_zeros_like_op SRCS test_fill_zeros_like_op.py)
 py_test(gradient_checker SRCS gradient_checker.py)
@@ -22,8 +23,10 @@ py_test(test_rowwise_add_op SRCS test_rowwise_add_op.py)
 py_test(test_default_scope_funcs SRCS test_default_scope_funcs.py)
 py_test(test_operator SRCS test_operator.py)
-# py_test(test_gaussian_random_op SRCS test_gaussian_random_op.py)
+py_test(test_gaussian_random_op SRCS test_gaussian_random_op.py)
 py_test(test_uniform_random_op SRCS test_uniform_random_op.py)
 py_test(test_recurrent_op SRCS test_recurrent_op.py)
 py_test(test_sgd_op SRCS test_sgd_op.py)
 py_test(test_gradient_checker SRCS test_gradient_checker.py)
+py_test(test_scale_and_identity_op SRCS test_scale_and_identity_op.py)
+py_test(mnist SRCS mnist.py)
--- a/python/paddle/v2/framework/tests/gradient_checker.py
+++ b/python/paddle/v2/framework/tests/gradient_checker.py
@@ -160,8 +160,13 @@ class GradientChecker(unittest.TestCase):
            grad_tensor.set(data, place)
        # run backward op
-        for name in backward_op.outputs():
+        backward_outs = backward_op.outputs()
+        backward_names = [
+            item for key in backward_outs for item in backward_outs[key]
+        ]
+        for name in backward_names:
            scope.new_var(name)
        backward_op.infer_shape(scope)
        backward_op.run(scope, ctx)

--- a/python/paddle/v2/framework/tests/mnist.py
+++ b/python/paddle/v2/framework/tests/mnist.py
+import paddle.v2.framework.core as core
+from paddle.v2.framework.op import Operator
+import numpy
+import paddle.v2 as paddle
+BATCH_SIZE = 100
+scope = core.Scope()
+place = core.CPUPlace()
+# if you want to test GPU training, you can use gpu place
+# place = core.GPUPlace(0)
+dev_ctx = core.DeviceContext.create(place)
+init_net = core.Net.create()
+forward_net = core.Net.create()
+backward_net = None
+optimize_net = core.Net.create()
+def atomic_id():
+    id = 0
+    while True:
+        yield id
+        id += 1
+uniq_id = atomic_id().next
+def data_layer(name, dims):
+    var = scope.new_var(name)
+    tensor = var.get_tensor()
+    tensor.set_dims(dims)  # 1 is batch size holder.
+    return name
+def feed_data(name, data):
+    assert isinstance(data, numpy.ndarray)
+    tensor = scope.find_var(name).get_tensor()
+    tensor.set_dims(data.shape)
+    if data.dtype == numpy.dtype('int32'):
+        tensor.alloc_int(place)
+    elif data.dtype == numpy.dtype('float32'):
+        tensor.alloc_float(place)
+    else:
+        raise ValueError("data type not supported")
+    tensor.set(data, place)
+def grad_var_name(var_name):
+    return var_name + "@GRAD"
+def sgd_optimizer(net, param_name, learning_rate=0.005):
+    grad_name = grad_var_name(param_name)
+    optimize_op = Operator(
+        "sgd",
+        param=param_name,
+        grad=grad_name,
+        param_out=param_name,
+        learning_rate=learning_rate)
+    net.append_op(optimize_op)
+# should use operator and add these to the init_network
+def init_param(net, param_name, dims):
+    scope.new_var(param_name)
+    op = Operator(
+        "uniform_random", Out=param_name, dims=dims, min=-0.5, max=0.5, seed=10)
+    op.infer_shape(scope)
+    net.append_op(op)
+# fc_layer
+def fc_layer(net, input, size, act="softmax", bias=True, param=None, name=None):
+    """
+    Add a fc layer to net
+    :param input: input variable name.
+    :type input: str
+    :param size: fully connected layer size.
+    :param act: activation name
+    :param param: parameter attribute, used for initialize parameters.
+    :param bias: bias attribute. False will not have a bias.
+    :param name: the name of fc layer. If not set, model will generate a
+    readable name
+    :return: output variable name.
+    """
+    if name is None:
+        name = 'fc_%d' % uniq_id()
+    if not isinstance(name, str):
+        raise ValueError("name should be string")
+    input_dims = scope.find_var(input).get_tensor().get_dims()
+    w_name = param or name + ".w"
+    init_param(net=init_net, param_name=w_name, dims=[input_dims[1], size])
+    sgd_optimizer(net=optimize_net, param_name=w_name, learning_rate=0.01)
+    pre_activation = name + ".mul.out"
+    scope.new_var(pre_activation)
+    mul_op = Operator("mul", X=input, Y=w_name, Out=pre_activation)
+    net.append_op(mul_op)
+    # create bias variable if needed
+    if bias:
+        bias_name = name + ".b"
+        init_param(net=init_net, param_name=bias_name, dims=[size])
+        sgd_optimizer(
+            net=optimize_net, param_name=bias_name, learning_rate=0.001)
+        bias_out = name + ".rowwise_add.out"
+        scope.new_var(bias_out)
+        rowwise_append_op = Operator(
+            "rowwise_add", X=pre_activation, b=bias_name, Out=bias_out)
+        net.append_op(rowwise_append_op)
+        pre_activation = bias_out
+    activation_op = Operator(act, X=pre_activation, Y=name)
+    net.append_op(activation_op)
+    scope.new_var(name)
+    net.infer_shape(scope)
+    return name
+def cross_entropy_layer(net, input, label):
+    cost_name = 'cross_entropy_%d' % uniq_id()
+    cross_entropy_op = Operator(
+        "onehot_cross_entropy", X=input, label=label, Y=cost_name)
+    net.append_op(cross_entropy_op)
+    scope.new_var(cost_name)
+    net.infer_shape(scope)
+    return cost_name
+def create_backward_net(forward_net):
+    net = core.Operator.backward(forward_net, set())
+    for input in net.inputs()["all"]:
+        var = scope.new_var(input)
+        var.get_tensor()
+    for output in net.outputs()["all"]:
+        var = scope.new_var(output)
+        var.get_tensor()
+    return net
+def debug_print_op(op):
+    print("===============" + op.type() + "==============")
+    print("***inputs:***")
+    for input in op.inputs()["all"]:
+        print input, scope.find_var(input).get_tensor().get_dims()
+    print("\n***outputs:***")
+    for output in op.outputs()["all"]:
+        print output, scope.find_var(output).get_tensor().get_dims()
+    print("")
+    print("")
+def set_cost(cost):
+    cost_shape = numpy.array(scope.find_var(cost).get_tensor()).shape
+    cost_grad = \
+        scope.find_var(grad_var_name(cost)).get_tensor()
+    cost_grad.set_dims(cost_shape)
+    cost_grad.alloc_float(place)
+    cost_grad.set(numpy.ones(cost_shape).astype("float32"), place)
+def get_cost_mean(cost):
+    cost_data = numpy.array(scope.find_var(cost).get_tensor())
+    return cost_data.sum() / len(cost_data)
+def error_rate(predict, label):
+    predict_var = numpy.array(scope.find_var(predict).get_tensor()).argmax(
+        axis=1)
+    label = numpy.array(scope.find_var(label).get_tensor())
+    error_num = numpy.sum(predict_var != label)
+    return error_num / float(len(label))
+images = data_layer(name='pixel', dims=[BATCH_SIZE, 784])
+labels = data_layer(name='label', dims=[BATCH_SIZE])
+fc1 = fc_layer(net=forward_net, input=images, size=100, act="sigmoid")
+fc2 = fc_layer(net=forward_net, input=fc1, size=100, act="sigmoid")
+predict = fc_layer(net=forward_net, input=fc2, size=100, act="softmax")
+cost = cross_entropy_layer(net=forward_net, input=predict, label=labels)
+init_net.complete_add_op(True)
+forward_net.complete_add_op(True)
+backward_net = create_backward_net(forward_net)
+optimize_net.complete_add_op(True)
+print(init_net)
+print(forward_net)
+print(backward_net)
+print(optimize_net)
+debug_print_op(forward_net)
+debug_print_op(backward_net)
+debug_print_op(optimize_net)
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.mnist.train(), buf_size=8192),
+    batch_size=BATCH_SIZE)
+def test(cost_name):
+    test_reader = paddle.batch(
+        paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
+    cost = []
+    error = []
+    for data in test_reader():
+        image_data = numpy.array(map(lambda x: x[0], data)).astype("float32")
+        label_data = numpy.array(map(lambda x: x[1], data)).astype("int32")
+        feed_data(images, image_data)
+        feed_data(labels, label_data)
+        forward_net.infer_shape(scope)
+        forward_net.run(scope, dev_ctx)
+        cost.append(get_cost_mean(cost_name))
+        error.append(error_rate(predict, "label"))
+    print("cost=" + str(sum(cost) / float(len(cost))) + " error_rate=" + str(
+        sum(error) / float(len(error))))
+PASS_NUM = 1
+init_net.run(scope, dev_ctx)
+for pass_id in range(PASS_NUM):
+    batch_id = 0
+    for data in train_reader():
+        image_data = numpy.array(map(lambda x: x[0], data)).astype("float32")
+        label_data = numpy.array(map(lambda x: x[1], data)).astype("int32")
+        feed_data(images, image_data)
+        feed_data(labels, label_data)
+        forward_net.infer_shape(scope)
+        forward_net.run(scope, dev_ctx)
+        set_cost(cost)
+        backward_net.infer_shape(scope)
+        backward_net.run(scope, dev_ctx)
+        optimize_net.run(scope, dev_ctx)
+        if batch_id % 100 == 0:
+            print("pass[" + str(pass_id) + "] batch_id[" + str(batch_id) + "]")
+            test(cost)
+        batch_id = batch_id + 1
--- a/python/paddle/v2/framework/tests/op_test_util.py
+++ b/python/paddle/v2/framework/tests/op_test_util.py
@@ -64,7 +64,8 @@ class OpTestMeta(type):
                    actual = numpy.array(scope.find_var(out_name).get_tensor())
                    expect = self.outputs[out_name]
                    self.assertTrue(
-                        numpy.allclose(actual, expect),
+                        numpy.allclose(
+                            actual, expect, atol=1e-05),
                        "output name: " + out_name + "has diff")
        obj.test_all = test_all

--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
@@ -8,9 +8,8 @@ class TestCrossEntropy(unittest.TestCase):
    __metaclass__ = OpTestMeta
    def setUp(self):
-        # TODO this unit test is not passed
        self.type = "onehot_cross_entropy"
-        batch_size = 100
+        batch_size = 30
        class_num = 10
        X = numpy.random.random((batch_size, class_num)).astype("float32")
        label = 5 * numpy.ones(batch_size).astype("int32")
@@ -22,9 +21,9 @@ class TestCrossEntropy(unittest.TestCase):
 class CrossEntropyGradOpTest(GradientChecker):
-    def test_softmax_grad(self):
+    def test_check_grad(self):
        op = create_op("onehot_cross_entropy")
-        batch_size = 100
+        batch_size = 30
        class_num = 10
        inputs = {
            "X": numpy.random.uniform(

--- a/python/paddle/v2/framework/tests/test_gather_op.py
+++ b/python/paddle/v2/framework/tests/test_gather_op.py
+import unittest
+from op_test_util import OpTestMeta
+from gradient_checker import GradientChecker, create_op
+import numpy
+import paddle.v2.framework.core as core
+from paddle.v2.framework.op import Operator
+class TestGatherOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+    def setUp(self):
+        self.type = "gather"
+        xnp = numpy.random.random((10, 20)).astype("float32")
+        self.inputs = {
+            'X': xnp,
+            'Index': numpy.array([1, 3, 5]).astype("int32")
+        }
+        self.outputs = {'Out': self.inputs['X'][self.inputs['Index']]}
+class TestGatherGradOp(GradientChecker):
+    def test_gather_grad(self):
+        print 'creating op'
+        op = create_op("gather")
+        print 'creating op done'
+        xnp = numpy.random.random((10, 20)).astype("float32")
+        inputs = {'X': xnp, 'Index': numpy.array([1, 3, 5]).astype("int32")}
+        print 'correct before check gradient'
+        self.check_grad(op, inputs, set("X"), "Out")
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_minus_op.py
+++ b/python/paddle/v2/framework/tests/test_minus_op.py
+import unittest
+import numpy as np
+from gradient_checker import GradientChecker, create_op
+from op_test_util import OpTestMeta
+class MinusOpTest(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+    def setUp(self):
+        self.type = "minus"
+        self.inputs = {
+            'X': np.random.random((32, 84)).astype("float32"),
+            'Y': np.random.random((32, 84)).astype("float32")
+        }
+        self.outputs = {'Out': (self.inputs['X'] - self.inputs['Y'])}
+class MinusGradTest(GradientChecker):
+    def test_left(self):
+        op = create_op("minus")
+        inputs = {
+            "X": np.random.random((10, 10)).astype("float32"),
+            "Y": np.random.random((10, 10)).astype("float32")
+        }
+        self.check_grad(op, inputs, ["X", 'Y'], "Out")
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_mul_op.py
--- a/python/paddle/v2/framework/tests/test_net.py
+++ b/python/paddle/v2/framework/tests/test_net.py
--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
--- a/python/paddle/v2/framework/tests/test_rowwise_add_op.py
+++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
--- a/python/paddle/v2/framework/tests/test_scale_and_identity_op.py
+++ b/python/paddle/v2/framework/tests/test_scale_and_identity_op.py