diff --git a/.travis.yml b/.travis.yml
index 376c693602b56fe719decfeb41c217497e143e12..8c8c6699d3d9abddd65a3a224c2bceedc7d88348 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -38,7 +38,7 @@ before_install:
   # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
   # protobuf version.
   - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker
-  - pip install rarfile
+  - pip install rarfile nltk==3.2.2 scipy==0.19.0 recordio matplotlib Pillow
   - curl https://glide.sh/get | bash
   - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
   - go get -u github.com/alecthomas/gometalinter
diff --git a/Dockerfile b/Dockerfile
index 156ad3552b2c4ff90b405c35c66d44117c2624a4..06a3d8930769bca2599a7afedb3683b2207cb302 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -38,17 +38,16 @@ RUN apt-get update && \
 RUN pip --no-cache-dir install 'numpy>=1.12.0'
 
 # Install Go and glide
-RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
-    tar -C /usr/local -xzf go.tgz && \
+RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
+    tar -xz -C /usr/local && \
     mkdir /root/gopath && \
     mkdir /root/gopath/bin && \
-    mkdir /root/gopath/src && \
-    rm go.tgz
+    mkdir /root/gopath/src
 ENV GOROOT=/usr/local/go GOPATH=/root/gopath
 # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
 ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
 # install glide
-RUN curl -q https://glide.sh/get | sh
+RUN curl -q -s https://glide.sh/get | sh
 
 # git credential to skip password typing
 RUN git config --global credential.helper store
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index 372272a53c12c314fc80eebbce5eae9fcabc55ba..cb330ea5e1b914587a725c9b90a33053f3fbbc3d 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -257,6 +257,16 @@ seq_concat
 ..  autoclass:: paddle.v2.layer.seq_concat
     :noindex:
 
+kmax_sequence_score
+-------------------
+..  autoclass:: paddle.v2.layer.kmax_sequence_score
+    :noindex:
+
+sub_nested_seq
+--------------
+..  autoclass:: paddle.v2.layer.sub_nested_seq
+    :noindex:
+
 Reshaping Layers
 ================
 
diff --git a/doc/design/releasing_process.md b/doc/design/releasing_process.md
index 3692a5248a355cfcfd1cfd0911d43d65166921b1..0c10e782808ca6456347ec54cb5e921162731ede 100644
--- a/doc/design/releasing_process.md
+++ b/doc/design/releasing_process.md
@@ -11,6 +11,15 @@ Paddle每次发新的版本，遵循以下流程:
 	* 编译这个版本的Ubuntu Deb包。如果失败，修复Ubuntu Deb包编译问题，Patch号加一，返回第二步。
 	* 使用Regression Test List作为检查列表，测试Docker镜像/ubuntu安装包的功能正确性
 		* 如果失败，记录下所有失败的例子，在这个`release/版本号`分支中，修复所有bug后，Patch号加一，返回第二步
+	* 编译这个版本的python wheel包，并发布到pypi。
+		* 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513)，在使用twine上传之前，需要重命名wheel包中platform相关的后缀，比如将`linux_x86_64`修改成`manylinux1_x86_64`。
+		* pypi上的package名称为paddlepaddle和paddlepaddle_gpu，如果要上传GPU版本的包，需要修改build/python/setup.py中，name: "paddlepaddle_gpu"并重新打包wheel包：`python setup.py bdist_wheel`。
+		* 上传方法：
+			```
+			cd build/python
+			pip install twine
+			twine upload dist/[package to upload]
+			```
 4. 第三步完成后，将`release/版本号`分支合入master分支，并删除`release/版本号`分支。将master分支的合入commit打上tag，tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
 5. 编译master分支的Docker发行镜像，发布到dockerhub。编译ubuntu的deb包，发布到github release页面
 6. 协同完成Release Note的书写
diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in
index 95cad835b11816f4d2e256c2abd662a545a5bad2..673948dfe7928240817b552141ec9bc2f8a672b7 100644
--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@@ -13,15 +13,11 @@
 # serve to show the default.
 import sys
 import os, subprocess
+sys.path.insert(0, os.path.abspath('@PROJ_ROOT@/python'))
 import shlex
 from recommonmark import parser, transform
-try:
-   import py_paddle
-   import paddle
-   import paddle.v2
-except ImportError:
-   print("Must install paddle python package before generating documentation")
-   sys.exit(1)
+import paddle
+import paddle.v2
 
 MarkdownParser = parser.CommonMarkParser
 AutoStructify = transform.AutoStructify
diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in
index b477f0120c4fa0544012080b7cfb8572d3c44b04..b6b50b7dcd5647b50a13703160489323ed90a1b4 100644
--- a/doc/templates/conf.py.en.in
+++ b/doc/templates/conf.py.en.in
@@ -13,15 +13,11 @@
 # serve to show the default.
 import sys
 import os, subprocess
+sys.path.insert(0, os.path.abspath('@PROJ_ROOT@/python'))
 import shlex
 from recommonmark import parser, transform
-try:
-   import py_paddle
-   import paddle
-   import paddle.v2
-except ImportError:
-   print("Must install paddle python package before generating documentation")
-   sys.exit(1)
+import paddle
+import paddle.v2
 
 
 MarkdownParser = parser.CommonMarkParser
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 31f778d53ba2867d61be1f87db8189981ace9e2b..04659639910b3b073c1d15f419aa6996360519e0 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -31,13 +31,17 @@ add_dependencies(framework_py_proto framework_py_proto_init)
 
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward)
+
+if(WITH_PYTHON)
 cc_library(paddle_pybind SHARED
     SRCS pybind.cc
     DEPS pybind python backward
-	fc_op
-	sgd_op
-	add_op
-	mean_op
-	cross_entropy_op
-	fill_zeros_like_op
-	recurrent_op)
+    fc_op
+    sgd_op
+    add_op
+    mean_op
+    cross_entropy_op
+    recurrent_op
+    uniform_random_op
+    fill_zeros_like_op)
+endif(WITH_PYTHON)
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index 10a3f49810f75b3bfe32b20c64c94295368b0b49..8f39b79cf729774d9057b3b68d71d2b4907da77a 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -13,6 +13,7 @@
    limitations under the License. */
 
 #include "paddle/framework/backward.h"
+
 #include <list>
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/net_op.h"
diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc
index 8e85a2510fe9bd4f8bbd67d12b76232a4f268e61..653b5693e8da0a977c28251e6aa55c5be820c7a5 100644
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -17,16 +17,21 @@
 #include <gtest/gtest.h>
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/net_op.h"
-#include "paddle/operators/type_alias.h"
 
 namespace paddle {
 namespace framework {
 
+using OperatorBase = framework::OperatorBase;
+using OpProtoAndCheckerMaker = framework::OpProtoAndCheckerMaker;
+using OpProto = framework::OpProto;
+using OpAttrChecker = framework::OpAttrChecker;
+using Scope = framework::Scope;
+using DeviceContext = platform::DeviceContext;
+
 class EmptyOp : public OperatorBase {
  public:
   void InferShape(const Scope &scope) const override {}
-  void Run(const Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {}
+  void Run(const Scope &scope, const DeviceContext &dev_ctx) const override {}
 };
 
 class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
@@ -71,7 +76,7 @@ class NoGradOpMaker : public OpProtoAndCheckerMaker {
   }
 };
 
-class FcOp : public ops::NetOp {
+class FcOp : public operators::NetOp {
  public:
   void Init() override {
     AddOp(OpRegistry::CreateOp("mul",
@@ -145,6 +150,7 @@ class AddOpMaker : public OpProtoAndCheckerMaker {
 }  // namespace paddle
 
 namespace f = paddle::framework;
+namespace ops = paddle::operators;
 using EnforceNotMet = paddle::platform::EnforceNotMet;
 REGISTER_OP(rowwise_add, f::EmptyOp, f::RowWiseAddOpMaker);
 REGISTER_GRADIENT_OP(rowwise_add, rowwise_add_grad, f::EmptyOp);
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index 9123e9b56fd068d7df59160e8987bc51bbf511df..db23fd7bf938a1b2e97347d46c2f58efb9773009 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -204,12 +204,6 @@ class OpRegistry {
     return CreateOp(op_desc.type(), inputs, outputs, attrs);
   }
 
-  static bool SupportGPU(const std::string& op_type) {
-    OperatorWithKernel::OpKernelKey key;
-    key.place_ = platform::GPUPlace();
-    return OperatorWithKernel::AllOpKernels().at(op_type).count(key) != 0;
-  }
-
   static std::shared_ptr<OperatorBase> CreateGradOp(const OperatorBase& op) {
     PADDLE_ENFORCE(!op.IsNetOp(),
                    "Use framework::Backward to get backward ops");
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index ec498ce3bd8b5d40c3c3ad081c217c4c7e8dd593..698ff5f36ddf0a621b6cf8d07d9f57f9074854b8 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -87,6 +87,8 @@ class OperatorBase {
 
   virtual bool IsNetOp() const { return false; }
 
+  virtual bool SupportGPU() const { return false; }
+
   /// rename inputs outputs name
   void Rename(const std::string& old_name, const std::string& new_name);
 
@@ -160,14 +162,14 @@ class OperatorContext {
   template <typename T>
   const T* Input(const std::string& name) const {
     auto var = InputVar(name);
-    PADDLE_ENFORCE(var != nullptr, "Input(%s) should not be nullptr", name);
+    PADDLE_ENFORCE_NOT_NULL(var, "Input(%s) should not be nullptr", name);
     return &var->Get<T>();
   }
 
   template <typename T>
   T* Output(const std::string& name) const {
     auto var = OutputVar(name);
-    PADDLE_ENFORCE(var != nullptr, "Output(%s) should not be nullptr", name);
+    PADDLE_ENFORCE_NOT_NULL(var, "Output(%s) should not be nullptr", name);
     return var->GetMutable<T>();
   }
 
@@ -179,9 +181,9 @@ class OperatorContext {
     std::transform(names.begin(), names.end(), std::back_inserter(res),
                    [&](const std::string& sub_name) {
                      auto var = scope_.FindVar(sub_name);
-                     PADDLE_ENFORCE(var != nullptr,
-                                    "MultiInput(%s:%s) should not be nullptr",
-                                    name, sub_name);
+                     PADDLE_ENFORCE_NOT_NULL(
+                         var, "MultiInput(%s:%s) should not be nullptr", name,
+                         sub_name);
                      return &var->Get<T>();
                    });
     return res;
@@ -195,9 +197,9 @@ class OperatorContext {
     std::transform(names.begin(), names.end(), std::back_inserter(res),
                    [&](const std::string& sub_name) {
                      auto var = scope_.FindVar(sub_name);
-                     PADDLE_ENFORCE(var != nullptr,
-                                    "MultiOutput(%s:%s) should not be nullptr",
-                                    name, sub_name);
+                     PADDLE_ENFORCE_NOT_NULL(
+                         var, "MultiOutput(%s:%s) should not be nullptr", name,
+                         sub_name);
                      return var->GetMutable<T>();
                    });
     return res;
@@ -283,7 +285,7 @@ class OperatorWithKernel : public OperatorBase {
   using OpKernelMap =
       std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;
 
-  void InferShape(const Scope& scope) const {
+  void InferShape(const Scope& scope) const override {
     InferShape(InferShapeContext(this, scope));
   }
 
@@ -299,6 +301,12 @@ class OperatorWithKernel : public OperatorBase {
     return g_all_op_kernels;
   }
 
+  bool SupportGPU() const override {
+    OperatorWithKernel::OpKernelKey key;
+    key.place_ = platform::GPUPlace();
+    return OperatorWithKernel::AllOpKernels().at(type_).count(key) != 0;
+  }
+
  protected:
   virtual void InferShape(const InferShapeContext& ctx) const = 0;
 };
diff --git a/paddle/framework/pybind.cc b/paddle/framework/pybind.cc
index bba3af70258d9f037231f7984769ddb72eb373cb..3b3f33c8e212e9005beccdedf1d6b33e8c4a45ce 100644
--- a/paddle/framework/pybind.cc
+++ b/paddle/framework/pybind.cc
@@ -18,11 +18,8 @@ limitations under the License. */
 
 #include "paddle/framework/backward.h"
 #include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
-#include "paddle/framework/scope.h"
 #include "paddle/framework/tensor_py.h"
 #include "paddle/operators/net_op.h"
-#include "paddle/operators/type_alias.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
 #include "pybind11/numpy.h"
@@ -42,8 +39,12 @@ USE_OP(softmax);
 USE_OP(rowwise_add);
 USE_OP(fill_zeros_like);
 USE_OP_WITHOUT_KERNEL(recurrent_op);
+USE_OP(uniform_random);
 namespace paddle {
 namespace framework {
+
+using Tensor = framework::Tensor;
+
 template <typename ClassType>
 void ExposeOperator(ClassType &m) {
   m.def("infer_shape", &ClassType::type::InferShape)
@@ -130,8 +131,8 @@ All parameter, weight, gradient are variables in Paddle.
            [](Variable &self) -> Tensor * { return self.GetMutable<Tensor>(); },
            py::return_value_policy::reference)
       .def("get_net",
-           [](Variable &self) -> ops::NetOp * {
-             return self.GetMutable<ops::NetOp>();
+           [](Variable &self) -> operators::NetOp * {
+             return self.GetMutable<operators::NetOp>();
            },
            py::return_value_policy::reference);
 
@@ -202,8 +203,6 @@ All parameter, weight, gradient are variables in Paddle.
     return OpRegistry::CreateOp(desc);
   });
 
-  operator_base.def_static("support_gpu", &OpRegistry::SupportGPU);
-
   operator_base.def("backward",
                     [](const OperatorBase &forwardOp,
                        const std::unordered_set<std::string> &no_grad_vars) {
@@ -212,23 +211,24 @@ All parameter, weight, gradient are variables in Paddle.
 
   ExposeOperator(operator_base);
 
-  py::class_<ops::NetOp, std::shared_ptr<ops::NetOp>> net(m, "Net");
+  py::class_<operators::NetOp, std::shared_ptr<operators::NetOp>> net(m, "Net");
 
   net.def_static("create",
-                 []() -> std::shared_ptr<ops::NetOp> {
-                   auto retv = std::make_shared<ops::NetOp>();
+                 []() -> std::shared_ptr<operators::NetOp> {
+                   auto retv = std::make_shared<operators::NetOp>();
                    retv->type_ = "plain_net";
                    return retv;
                  })
-      .def("add_op", &ops::NetOp::AddOp)
-      .def(
-          "add_op",
-          [](ops::NetOp &self, const std::shared_ptr<ops::NetOp> &net) -> void {
-            self.AddOp(std::static_pointer_cast<OperatorBase>(net));
-          })
-      .def("complete_add_op", &ops::NetOp::CompleteAddOp)
-      .def("complete_add_op",
-           [](std::shared_ptr<ops::NetOp> &self) { self->CompleteAddOp(); });
+      .def("add_op", &operators::NetOp::AddOp)
+      .def("add_op",
+           [](operators::NetOp &self,
+              const std::shared_ptr<operators::NetOp> &net) -> void {
+             self.AddOp(std::static_pointer_cast<OperatorBase>(net));
+           })
+      .def("complete_add_op", &operators::NetOp::CompleteAddOp)
+      .def("complete_add_op", [](std::shared_ptr<operators::NetOp> &self) {
+        self->CompleteAddOp();
+      });
 
   ExposeOperator(net);
 
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 4c3b14b83d841e88683a13634c93f51c012128b6..c44df05e4b0fceed858fbf4f68eddc407a44c894 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -127,8 +127,8 @@ class Tensor {
                memory::PODDeleter<T, Place>(place)),
           place_(place),
           size_(size) {
-      PADDLE_ENFORCE(ptr_ != nullptr, "Insufficient %s memory to allocation.",
-                     is_cpu_place(place_) ? "CPU" : "GPU");
+      PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
+                              (is_cpu_place(place_) ? "CPU" : "GPU"));
     }
 
     virtual size_t size() const { return size_; }
diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h
index 92621f8c18ec0d03160a23c462830d14272c7f64..8d9bec6dc9c3f0af822a0d8cd8588dc932970652 100644
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -14,17 +14,18 @@ limitations under the License. */
 
 #pragma once
 #include "paddle/memory/memcpy.h"
+#include "paddle/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
 
 template <typename T>
 inline void Tensor::check_memory_size() const {
-  PADDLE_ENFORCE(holder_ != nullptr,
-                 "Tenosr holds no memory. Call Tensor::mutable_data first.");
-  PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_,
-                 "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
-                 "first to re-allocate memory.");
+  PADDLE_ENFORCE_NOT_NULL(
+      holder_, "Tenosr holds no memory. Call Tensor::mutable_data first.");
+  PADDLE_ENFORCE_GE(holder_->size(), product(dims_) * sizeof(T) + offset_,
+                    "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
+                    "first to re-allocate memory.");
 }
 
 template <typename T>
@@ -51,9 +52,9 @@ inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
 template <typename T>
 inline T* Tensor::mutable_data(platform::Place place) {
   static_assert(std::is_pod<T>::value, "T must be POD");
-  PADDLE_ENFORCE(product(dims_) > 0,
-                 "Tensor's numel must be larger than zero to call "
-                 "Tensor::mutable_data. Call Tensor::set_dim first.");
+  PADDLE_ENFORCE_GT(product(dims_), 0,
+                    "Tensor's numel must be larger than zero to call "
+                    "Tensor::mutable_data. Call Tensor::set_dim first.");
   /* some versions of boost::variant don't have operator!= */
   size_t size = product(dims_) * sizeof(T);
   if (holder_ == nullptr || !(holder_->place() == place) ||
@@ -120,11 +121,11 @@ inline void Tensor::CopyFrom(const Tensor& src,
 template <typename T>
 inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
   check_memory_size<T>();
-  PADDLE_ENFORCE(begin_idx >= 0, "Slice begin index is less than zero.");
-  PADDLE_ENFORCE(end_idx <= dims_[0], "Slice end index is out of bound.");
-  PADDLE_ENFORCE(begin_idx < end_idx,
-                 "Begin index must be less than end index.");
-  PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1.");
+  PADDLE_ENFORCE_GE(begin_idx, 0, "Slice begin index is less than zero.");
+  PADDLE_ENFORCE_LE(end_idx, dims_[0], "Slice end index is out of bound.");
+  PADDLE_ENFORCE_LT(begin_idx, end_idx,
+                    "Begin index must be less than end index.");
+  PADDLE_ENFORCE_NE(dims_[0], 1, "Can not slice a tensor with dims_[0] = 1.");
   int base = product(dims_) / dims_[0];
   Tensor dst;
   dst.holder_ = holder_;
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index ef1cc10b840896d9ab97f963fc12a4971cd74e1f..20276181b974bb5b3d6cb40fb5e6c1295cf1c02f 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -36,7 +36,8 @@ TEST(Tensor, DataAssert) {
   } catch (paddle::platform::EnforceNotMet err) {
     caught = true;
     std::string msg =
-        "Tenosr holds no memory. Call Tensor::mutable_data first.";
+        "holder_ should not be null\nTenosr holds no memory. Call "
+        "Tensor::mutable_data first.";
     const char* what = err.what();
     for (size_t i = 0; i < msg.length(); ++i) {
       ASSERT_EQ(what[i], msg[i]);
@@ -111,7 +112,8 @@ TEST(Tensor, ShareDataWith) {
     } catch (paddle::platform::EnforceNotMet err) {
       caught = true;
       std::string msg =
-          "Tenosr holds no memory. Call Tensor::mutable_data first.";
+          "holder_ should not be null\nTenosr holds no memory. Call "
+          "Tensor::mutable_data first.";
       const char* what = err.what();
       for (size_t i = 0; i < msg.length(); ++i) {
         ASSERT_EQ(what[i], msg[i]);
diff --git a/paddle/gserver/layers/KmaxSeqScoreLayer.cpp b/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8ce591d4762466e1ed4b2970cb9cae9203bc0a2b
--- /dev/null
+++ b/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
@@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+
+namespace paddle {
+
+class KmaxSeqScoreLayer : public Layer {
+private:
+  MatrixPtr scores_;
+  size_t beamSize_;
+  void kmaxScorePerSeq(const real* score,
+                       real* sortedRes,
+                       const ICpuGpuVectorPtr seqStartPos);
+
+public:
+  explicit KmaxSeqScoreLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(kmax_seq_score, KmaxSeqScoreLayer);
+
+bool KmaxSeqScoreLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  bool ret = Layer::init(layerMap, parameterMap);
+  CHECK_EQ(1U, inputLayers_.size());
+
+  beamSize_ = config_.beam_size();
+  CHECK_GE(beamSize_, 1U);
+
+  setNeedSequenceInfo(false);
+  setNeedGradient(false);
+  return ret;
+}
+
+void KmaxSeqScoreLayer::kmaxScorePerSeq(const real* scores,
+                                        real* sortedIds,
+                                        const ICpuGpuVectorPtr seqStartPos) {
+  int* starts = seqStartPos->getMutableData(false);
+  std::vector<real> indices;
+  for (size_t i = 0; i < seqStartPos->getSize() - 1; ++i) {
+    int seqLen = starts[i + 1] - starts[i];
+    int k = std::min(static_cast<int>(beamSize_), seqLen);
+
+    indices.resize(seqLen, 0);
+    std::iota(begin(indices), end(indices), 0.);
+    std::vector<real> tmpScore(scores + starts[i], scores + starts[i + 1]);
+    std::partial_sort(
+        begin(indices),
+        begin(indices) + k,
+        end(indices),
+        [&](size_t a, size_t b) { return tmpScore[a] > tmpScore[b]; });
+    memcpy(sortedIds + (i * beamSize_), indices.data(), k * sizeof(real));
+  }
+}
+
+void KmaxSeqScoreLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const Argument& input = getInput(0);
+  const MatrixPtr inputScore = getInputValue(0);
+
+  CHECK(input.hasSeq() || input.hasSubseq())
+      << "input of " << getName()
+      << " must be a sequence or a nested sequence.";
+  CHECK_EQ(input.value->getWidth(), 1UL)
+      << "input of " << getName()
+      << " is score over a sequence or a nested sequence, so its width "
+      << " must be 1.";
+
+  if (useGpu_) {
+    // This layer runs only on CPU; if the model is running on GPU,
+    // copy the input of this layer from GPU to CPU first.
+    Matrix::resizeOrCreate(scores_,
+                           inputScore->getHeight(),
+                           1,
+                           false /* trans */,
+                           false /* useGpu */);
+    scores_->copyFrom(*inputScore);
+  } else {
+    scores_ = inputScore;
+  }
+
+  Matrix::resizeOrCreate(
+      output_.value,
+      input.hasSubseq() ? input.getNumSubSequences() : input.getNumSequences(),
+      beamSize_,
+      false,
+      false);
+  output_.value->one();
+  output_.value->mulScalar(-1.);
+
+  kmaxScorePerSeq(scores_->getData(),
+                  output_.value->getData(),
+                  input.hasSubseq() ? input.subSequenceStartPositions
+                                    : input.sequenceStartPositions);
+}
+
+void KmaxSeqScoreLayer::backward(const UpdateCallback& callback) {}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/SubNestedSequenceLayer.cpp b/paddle/gserver/layers/SubNestedSequenceLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..76f587fff760d9eb9c2a8eeed53abf4d42e90834
--- /dev/null
+++ b/paddle/gserver/layers/SubNestedSequenceLayer.cpp
@@ -0,0 +1,176 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+class SubNestedSequenceLayer : public Layer {
+public:
+  explicit SubNestedSequenceLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+private:
+  /*
+   * This function generates the indices of rows in a batch according to the
+   * indices of the selected sub-sequences in each sequence.
+   *
+   * Examples:
+   * selectedIndices:
+   *   [
+   *     [0, 1, -1],
+   *     [0, 1, 2],
+   *     [0, -1, -1],
+   *     [0, 2, 3],
+   *   ]
+   * inputSeqInfo:
+   *   [
+   *     [0,3,4],
+   *     [4,5,7,10,15],
+   *     [15,20],
+   *     [20,22,23,25,28]
+   *   ]
+   *
+   * the output is saved to the private member rowIndice_:
+   * [0,1,2,3,4,5,6,7,8,9,15,16,17,18,19,20,
+   *  21,23,24,25,26,27]
+   */
+
+  void calSelectedCols(const MatrixPtr selectedIndices,
+                       const std::vector<std::vector<int>>& inputSeqInfo);
+
+  // if the second input of this layer is on GPU memory, copy it to CPU memory.
+  MatrixPtr selIdsCpu_;
+
+  // reorganized sequenceStartPositions and subSequenceStartPositions
+  // into a 2d vector to facilitate the sequence selection process.
+  std::vector<std::vector<int>> inputSeqInfoVec_;
+
+  // the final selected row indices in a batch. On CPU, rowIndice_ is created
+  // from selectedRows_'s buffer (shared memory); on GPU it holds a copy.
+  IVectorPtr rowIndice_;
+  std::vector<int> selectedRows_;
+};
+
+REGISTER_LAYER(sub_nested_seq, SubNestedSequenceLayer);
+
+bool SubNestedSequenceLayer::init(const LayerMap& layerMap,
+                                  const ParameterMap& parameterMap) {
+  /* Initialize the parent class and propagate its status to the caller. */
+  bool ret = Layer::init(layerMap, parameterMap);
+  CHECK_EQ(2U, inputLayers_.size());
+  setNeedSequenceInfo(false);
+  return ret;
+}
+
+void SubNestedSequenceLayer::calSelectedCols(
+    const MatrixPtr selectedIndices,
+    const std::vector<std::vector<int>>& inputSeqInfo) {
+  selectedRows_.clear();
+
+  std::vector<int> outSeqStartInfo(1, 0);
+  std::vector<int> outSubSeqStartInfo(1, 0);
+
+  size_t seqNum = selectedIndices->getHeight();
+  size_t beamSize = selectedIndices->getWidth();
+  for (size_t i = 0; i < seqNum; ++i) {
+    for (size_t j = 0; j < beamSize; ++j) {
+      if (selectedIndices->getElement(i, j) == -1.) break;
+      int selSubSeqIdx = selectedIndices->getElement(i, j);
+      CHECK_GT(inputSeqInfoVec_[i].size() - 1, selSubSeqIdx);
+
+      size_t subSeqLen = inputSeqInfoVec_[i][selSubSeqIdx + 1] -
+                         inputSeqInfoVec_[i][selSubSeqIdx];
+      for (size_t k = 0; k < subSeqLen; ++k)
+        selectedRows_.push_back(inputSeqInfoVec_[i][selSubSeqIdx] + k);
+      outSubSeqStartInfo.push_back(outSubSeqStartInfo.back() + subSeqLen);
+    }
+    outSeqStartInfo.push_back(outSubSeqStartInfo.back());
+  }
+
+  if (useGpu_) {
+    rowIndice_ = IVector::create(selectedRows_.size(), useGpu_);
+    rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size());
+  } else {
+    rowIndice_ =
+        IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_);
+  }
+
+  // create the sequence information for the output.
+  ICpuGpuVector::resizeOrCreate(
+      output_.sequenceStartPositions, outSeqStartInfo.size(), false);
+  output_.sequenceStartPositions->copyFrom(
+      outSeqStartInfo.data(), outSeqStartInfo.size(), false);
+
+  ICpuGpuVector::resizeOrCreate(
+      output_.subSequenceStartPositions, outSubSeqStartInfo.size(), false);
+  output_.subSequenceStartPositions->copyFrom(
+      outSubSeqStartInfo.data(), outSubSeqStartInfo.size(), false);
+}
+
+void SubNestedSequenceLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const Argument& inputSeq = getInput(0);
+  CHECK(inputSeq.hasSubseq()) << "The first input of SubNestSequence layer "
+                              << "must be a nested sequence.";
+  const MatrixPtr selectedIndices = getInputValue(1);
+  CHECK_EQ(inputSeq.getNumSequences(), selectedIndices->getHeight());
+
+  if (dynamic_cast<GpuMatrix*>(selectedIndices.get())) {
+    /*
+     * Currently, the second input for this layer is generated either by
+     * kmax_sequence_score_layer, whose output is always stored on CPU,
+     * or by a data_layer, which can be on GPU.
+     *
+     * If the second input is on GPU, copy it to CPU memory, because this
+     * input always uses very little memory, and operations related to it
+     * are all control logic, not computations.
+     */
+    Matrix::resizeOrCreate(selIdsCpu_,
+                           selectedIndices->getHeight(),
+                           selectedIndices->getWidth(),
+                           false /* trans */,
+                           false /* useGpu */);
+    selIdsCpu_->copyFrom(*selectedIndices);
+  } else {
+    selIdsCpu_ = selectedIndices;
+  }
+
+  Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
+                              inputSeq.subSequenceStartPositions,
+                              inputSeqInfoVec_);
+  calSelectedCols(selIdsCpu_, inputSeqInfoVec_);
+
+  resetOutput(selectedRows_.size(), getSize());
+  getOutputValue()->selectRows(*getInputValue(0), *rowIndice_);
+}
+
+void SubNestedSequenceLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inputSeqGrad = getInputGrad(0);
+  MatrixPtr outputGrad = getOutputGrad();
+
+  if (inputSeqGrad) outputGrad->addToRows(*inputSeqGrad, *rowIndice_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index 5511ab6b8bb05108e76cc0913264d864d2fecf5b..209d0ab9c8d7e8463c8636b1412622a94f359fb1 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -66,6 +66,16 @@ add_unittest_without_exec(test_BatchNorm
 
 add_test(NAME test_BatchNorm
     COMMAND test_BatchNorm)
+
+
+################# test_KmaxSeqScore #######################
+add_unittest_without_exec(test_KmaxSeqScore
+    test_KmaxSeqScore.cpp
+    LayerGradUtil.cpp)
+
+add_test(NAME test_KmaxSeqScore
+    COMMAND test_KmaxSeqScore)
+
 ################## test_Evaluator #######################
 add_unittest(test_Evaluator
     test_Evaluator.cpp)
diff --git a/paddle/gserver/tests/test_KmaxSeqScore.cpp b/paddle/gserver/tests/test_KmaxSeqScore.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f958b4974d45ef65f8f374148a31ad3a6ce7632f
--- /dev/null
+++ b/paddle/gserver/tests/test_KmaxSeqScore.cpp
@@ -0,0 +1,160 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "ModelConfig.pb.h"
+#include "paddle/gserver/layers/DataLayer.h"
+#include "paddle/trainer/Trainer.h"
+#include "paddle/utils/GlobalConstants.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+vector<int> randSampling(int range, int n) {
+  // Return n distinct integers drawn at random from [0, range).
+  CHECK_GE(range, n);
+  vector<int> num(range);
+  iota(begin(num), end(num), 0);
+  // Partial Fisher-Yates on rand() so the srand() seed in main() applies.
+  for (int i = 0; i < n; ++i) swap(num[i], num[i + rand() % (range - i)]);
+  num.resize(n);
+  return num;
+}
+
+void genRandomSeqInfo(vector<int>& seqStartPosition,
+                      vector<int>& subSeqStartPosition) {
+  const int maxSeqNum = 100;
+  // Build random nested-sequence offsets; assign() drops any stale content.
+  int seqNum = 1 + (rand() % maxSeqNum);
+  seqStartPosition.assign(seqNum + 1, 0);
+  subSeqStartPosition.assign(1, 0);
+  // Sequence i holds subSeqLen sub-sequences, each subSeqLen elements long.
+  for (int i = 0; i < seqNum; ++i) {
+    int subSeqLen = 1 + (rand() % maxSeqNum);
+    for (int j = 0; j < subSeqLen; ++j)
+      subSeqStartPosition.push_back(subSeqStartPosition.back() + subSeqLen);
+    seqStartPosition[i + 1] = subSeqStartPosition.back();
+  }
+}
+
+void genRandomGroundTruth(real* values,
+                          vector<vector<int>>& groundTruth,
+                          vector<int>& startPos,
+                          size_t beamSize) {
+  // Per sequence, set up to beamSize random positions of values to 1.
+  groundTruth.resize(startPos.size() - 1, vector<int>(beamSize, -1));
+  for (size_t seq = 0; seq < startPos.size() - 1; ++seq) {
+    const int len = startPos[seq + 1] - startPos[seq];
+    auto picked = randSampling(len, min(static_cast<int>(beamSize), len));
+    for (size_t k = 0; k < picked.size(); ++k) {
+      groundTruth[seq][k] = picked[k];
+      values[startPos[seq] + picked[k]] = 1.;
+    }
+  }
+}
+
+void checkLayerOut(vector<vector<int>> groundTruth,  // by value: sorted here
+                   real* layerOut,
+                   size_t beamSize) {
+  for (size_t i = 0; i < groundTruth.size(); ++i) {
+    int begPos = i * beamSize;  // row i of layerOut spans beamSize entries
+    vector<real> tmp(layerOut + begPos, layerOut + begPos + beamSize);
+    sort(begin(tmp), end(tmp));  // sort both sides: order does not matter
+    sort(begin(groundTruth[i]), end(groundTruth[i]));
+    for (size_t j = 0; j < beamSize; ++j) CHECK_EQ(tmp[j], groundTruth[i][j]);
+  }
+}
+
+TEST(Layer, kmaxSeqScoreLayer) {
+  const size_t maxBeamSize = 100;
+  size_t beamSize = 1 + (rand() % maxBeamSize);  // passed on as size_t
+
+  vector<int> seqStartPosition;
+  vector<int> subSeqStartPosition;
+  genRandomSeqInfo(seqStartPosition, subSeqStartPosition);
+  MatrixPtr inValue =
+      Matrix::create(subSeqStartPosition.back(), 1, false, false);
+  // Exercise plain and nested sequence input, each with useGpu off and on.
+  for (auto hasSubseq : {false, true}) {
+    vector<vector<int>> groundTruth;
+    inValue->randomizeUniform();
+    genRandomGroundTruth(inValue->getData(),
+                         groundTruth,
+                         hasSubseq ? subSeqStartPosition : seqStartPosition,
+                         beamSize);
+
+    for (auto useGpu : {false, true}) {
+      TestConfig config;
+      config.layerConfig.set_type("kmax_seq_score");
+      config.layerConfig.set_beam_size(beamSize);
+
+      if (hasSubseq) {
+        config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                                    "scores",
+                                    inValue,
+                                    seqStartPosition,
+                                    subSeqStartPosition});
+      } else {
+        config.inputDefs.push_back(
+            {INPUT_SELF_DEFINE_DATA, "scores", inValue, seqStartPosition});
+      }
+      config.layerConfig.add_inputs();
+
+      // data layer initialize
+      std::vector<DataLayerPtr> dataLayers;
+      LayerMap layerMap;
+      vector<Argument> datas;
+      initDataLayer(
+          config,
+          &dataLayers,
+          &datas,
+          &layerMap,
+          "kmax_seq_score",
+          100 /* actually this parameter is unused in self-defined input*/,
+          false,
+          useGpu);
+      // test layer initialize
+      std::vector<ParameterPtr> parameters;
+      LayerPtr kmaxSeqScoreLayer;
+      FLAGS_use_gpu = useGpu;
+      initTestLayer(config, &layerMap, &parameters, &kmaxSeqScoreLayer);
+      kmaxSeqScoreLayer->forward(PASS_TRAIN);
+
+      const MatrixPtr outValue = kmaxSeqScoreLayer->getOutputValue();
+      CHECK_EQ(outValue->getHeight(),
+               hasSubseq ? subSeqStartPosition.size() - 1
+                         : seqStartPosition.size() - 1);
+      CHECK_EQ(outValue->getWidth(), beamSize);
+      checkLayerOut(groundTruth, outValue->getData(), beamSize);
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand((size_t)(time(NULL)));  // time-based seed: inputs differ on every run
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index fe11278f41c0118ee0bdb34f17fbf9602e0fa76b..0f312b6ca50bc1e6317251ba785f1c61a224b54e 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1899,6 +1899,84 @@ TEST(Layer, CropLayer) {
   }
 }
 
+vector<real> randSampling(real range, int n) {
+  // Return n distinct values from {0, ..., range - 1} in ascending order.
+  CHECK_GE(range, n);
+  vector<real> num(range);
+  iota(begin(num), end(num), 0.);
+  // Partial Fisher-Yates on rand() so the srand() seed of the test applies.
+  for (int i = 0; i < n; ++i) swap(num[i], num[i + rand() % (num.size() - i)]);
+  num.resize(n);
+  sort(begin(num), end(num));
+  return num;
+}
+
+TEST(Layer, SubNestedSequenceLayer) {
+  // layer size is not crucial for this layer,
+  // so use a small layer size in unittest
+  const int layerSize = 4;
+
+  const int maxSeqNum = 50;
+  const int maxSeqLen = 50;
+  const int maxBeamSize = 32;
+
+  srand((size_t)(time(NULL)));
+  int beamSize = 1 + (rand() % maxBeamSize);
+
+  TestConfig config;
+  config.layerConfig.set_type("sub_nested_seq");
+  config.layerConfig.set_name("sub_nested_seq_layer");
+  config.layerConfig.set_size(layerSize);
+
+  int seqNum = 1 + (rand() % maxSeqNum);
+
+  // sequence information for the first input, it is a nested sequence
+  vector<int> seqStartPos(seqNum + 1, 0);
+  vector<int> subSeqStartPos(1, 0);
+
+  // selected indices, every slot pre-filled with -1 (one() scaled by -1)
+  MatrixPtr selectedIndices = Matrix::create(seqNum, beamSize, false, false);
+  selectedIndices->one();
+  selectedIndices->mulScalar(-1.);
+  real* indicesData = selectedIndices->getData();
+
+  for (int i = 0; i < seqNum; ++i) {
+    int subSeqNum = 1 + (rand() % maxSeqNum);
+    for (int j = 0; j < subSeqNum; ++j) {
+      subSeqStartPos.push_back(subSeqStartPos.back() +
+                               (1 + (rand() % maxSeqLen)));
+    }
+    vector<real> selSeqs =
+        randSampling(static_cast<real>(subSeqNum), min(beamSize, subSeqNum));
+    memcpy(indicesData + (i * beamSize),
+           selSeqs.data(),
+           selSeqs.size() * sizeof(real));
+    seqStartPos[i + 1] = subSeqStartPos.back();
+  }
+
+  MatrixPtr seqInputPtr =
+      Matrix::create(seqStartPos.back(), layerSize, false, false);
+  seqInputPtr->randomizeUniform();
+  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                              "nested_seq_input",
+                              seqInputPtr,
+                              seqStartPos,
+                              subSeqStartPos});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back(
+      {INPUT_SELF_DEFINE_DATA, "selected_indices", selectedIndices});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "sub_nested_seq",
+                  /* batchSize */ seqNum,
+                  /* trans */ false,
+                  /* useGpu*/ useGpu,
+                  /* useWeight */ false);
+  }
+}
+
 TEST(Layer, ClipLayer) {
   const size_t batchSize = 128;
   const size_t size = 512;
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 2cf15ff69a7918fdf498fafaed8cbbc113a8e982..3ce18da887a1bf850b2e5c82b4c0040ac1eb3a1f 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -59,6 +59,7 @@ op_library(cross_entropy_op SRCS cross_entropy_op.cc cross_entropy_op.cu)
 op_library(fill_zeros_like_op SRCS fill_zeros_like_op.cc fill_zeros_like_op.cu)
 
 op_library(sgd_op SRCS sgd_op.cc sgd_op.cu)
+cc_test(sgd_op_test SRCS sgd_op_test.cc DEPS sgd_op)
 
 op_library(fc_op
     SRCS fc_op.cc
@@ -66,3 +67,5 @@ op_library(fc_op
 op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
     DEPS framework_proto tensor op_registry operator net_op)
 cc_test(recurrent_op_test SRCS recurrent_op_test.cc DEPS recurrent_op gtest mul_op add_op)
+op_library(uniform_random_op
+        SRCS uniform_random_op.cc uniform_random_op.cu)
diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc
index 29943002acab6db42dfdad2bc32f4c8725901136..adb1c4f0412dd213f0505c4b3e96e9a17cbf6b48 100644
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
@@ -17,9 +17,9 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-class AddOp : public OperatorWithKernel {
+class AddOp : public framework::OperatorWithKernel {
  protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
     PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->dims(),
                       ctx.Input<Tensor>("Y")->dims(),
                       "Two input of Add Op's dimension must be same.");
@@ -27,9 +27,9 @@ class AddOp : public OperatorWithKernel {
   }
 };
 
-class AddOpMaker : public OpProtoAndCheckerMaker {
+class AddOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  AddOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The first input of add op");
     AddInput("Y", "The second input of add op");
@@ -42,14 +42,17 @@ The equation is: Out = X + Y
   }
 };
 
-class AddOpGrad : public OperatorWithKernel {
+class AddOpGrad : public framework::OperatorWithKernel {
  protected:
-  void InferShape(const InferShapeContext &ctx) const override {}
+  void InferShape(const framework::InferShapeContext &ctx) const override {}
 };
 
 }  // namespace operators
 }  // namespace paddle
 
+namespace ops = paddle::operators;
 REGISTER_OP(add_two, ops::AddOp, ops::AddOpMaker);
 REGISTER_GRADIENT_OP(add_two, add_two_grad, ops::AddOpGrad);
-REGISTER_OP_CPU_KERNEL(add_two, ops::AddKernel<ops::CPUPlace, float>);
+
+REGISTER_OP_CPU_KERNEL(add_two,
+                       ops::AddKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/add_op.cu b/paddle/operators/add_op.cu
index 9bd08634da96c5595d6dd702ad9afafb94632b03..cec5f558cbc161124620ad4241d6bd8a5324277c 100644
--- a/paddle/operators/add_op.cu
+++ b/paddle/operators/add_op.cu
@@ -16,4 +16,6 @@
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/add_op.h"
 
-REGISTER_OP_GPU_KERNEL(add_two, ops::AddKernel<ops::GPUPlace, float>);
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(add_two,
+                       ops::AddKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/add_op.h b/paddle/operators/add_op.h
index 9310c1f7edfd2059e5f31486388a17a04359be63..a7307b6818aa3d10ff215d06281e2b53196fd101 100644
--- a/paddle/operators/add_op.h
+++ b/paddle/operators/add_op.h
@@ -13,15 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
 template <typename Place, typename T>
-class AddKernel : public OpKernel {
+class AddKernel : public framework::OpKernel {
  public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
     auto* input0 = context.Input<Tensor>("X");
     auto* input1 = context.Input<Tensor>("Y");
     auto* output = context.Output<Tensor>("Out");
diff --git a/paddle/operators/add_op_test.cc b/paddle/operators/add_op_test.cc
index 3d52f5498323dbb7ca0ff25d038947f0ddb2017e..bf529defb20d27200a28666278db8607b986e2d5 100644
--- a/paddle/operators/add_op_test.cc
+++ b/paddle/operators/add_op_test.cc
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 #define private public
-#include <paddle/framework/op_registry.h>
+#include "paddle/framework/op_registry.h"
+
 USE_OP(add_two);
-// USE_OP(add_two_grad);
 
 TEST(AddOp, GetOpProto) {
   auto& protos = paddle::framework::OpRegistry::protos();
diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc
index 77c8271fd4ca490afba11abc109f5c4296f0d1fd..7cb2aa4e78cdf7de98fb6488d2398ac811256f9e 100644
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -17,9 +17,9 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-class OnehotCrossEntropyOp : public OperatorWithKernel {
+class OnehotCrossEntropyOp : public framework::OperatorWithKernel {
  protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
     auto *X = ctx.Input<Tensor>("X");
     auto *label = ctx.Input<Tensor>("label");
 
@@ -30,9 +30,9 @@ class OnehotCrossEntropyOp : public OperatorWithKernel {
   }
 };
 
-class OnehotCrossEntropyGradientOp : public OperatorWithKernel {
+class OnehotCrossEntropyGradientOp : public framework::OperatorWithKernel {
  protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
     auto X_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto X = ctx.Input<Tensor>("X");
 
@@ -41,9 +41,10 @@ class OnehotCrossEntropyGradientOp : public OperatorWithKernel {
   }
 };
 
-class OnehotCrossEntropyOpMaker : public OpProtoAndCheckerMaker {
+class OnehotCrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  OnehotCrossEntropyOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  OnehotCrossEntropyOpMaker(framework::OpProto *proto,
+                            framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The first input of OnehotCrossEntropyOp");
     AddInput("label", "The second input of OnehotCrossEntropyOp");
@@ -59,11 +60,14 @@ OnehotCrossEntropy Operator.
 }  // namespace operators
 }  // namespace paddle
 
+namespace ops = paddle::operators;
 REGISTER_OP(onehot_cross_entropy, ops::OnehotCrossEntropyOp,
             ops::OnehotCrossEntropyOpMaker);
-REGISTER_OP_CPU_KERNEL(onehot_cross_entropy,
-                       ops::OnehotCrossEntropyOpKernel<ops::CPUPlace, float>);
-
+REGISTER_OP_CPU_KERNEL(
+    onehot_cross_entropy,
+    ops::OnehotCrossEntropyOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_GRADIENT_OP(onehot_cross_entropy, onehot_cross_entropy_grad,
+                     ops::OnehotCrossEntropyGradientOp);
 REGISTER_OP_CPU_KERNEL(
     onehot_cross_entropy_grad,
-    ops::OnehotCrossEntropyGradientOpKernel<ops::CPUPlace, float>);
+    ops::OnehotCrossEntropyGradientOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu
index ec73721a810fa86d65409f643401eb77248ad5de..4bbc8f093a794d46737a16488684a6a0cc25e285 100644
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
@@ -14,3 +14,8 @@
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/cross_entropy_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    onehot_cross_entropy,
+    ops::OnehotCrossEntropyOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h
index d5e3f29332809a63908ecc896b33d2adff6abe45..b7df92c9a98ebf12b72a8d3d8e8e4e1a950f06c9 100644
--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
@@ -13,11 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+
 template <typename T>
 T tolerable_value(T x) {
   static_assert(std::is_floating_point<T>::value,
@@ -38,9 +40,9 @@ T tolerable_value(T x) {
 }
 
 template <typename Place, typename T>
-class OnehotCrossEntropyOpKernel : public OpKernel {
+class OnehotCrossEntropyOpKernel : public framework::OpKernel {
  public:
-  void Compute(const ExecutionContext& ctx) const override {
+  void Compute(const framework::ExecutionContext& ctx) const override {
     auto X = ctx.Input<Tensor>("X");
     const T* Xdata = X->data<T>();
     const int* label_data = ctx.Input<Tensor>("label")->data<int>();
@@ -61,9 +63,9 @@ class OnehotCrossEntropyOpKernel : public OpKernel {
 };
 
 template <typename Place, typename T>
-class OnehotCrossEntropyGradientOpKernel : public OpKernel {
+class OnehotCrossEntropyGradientOpKernel : public framework::OpKernel {
  public:
-  void Compute(const ExecutionContext& ctx) const override {
+  void Compute(const framework::ExecutionContext& ctx) const override {
     auto X = ctx.Input<Tensor>("X");
     auto dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto dY = ctx.Input<Tensor>(framework::GradVarName("Y"));
diff --git a/paddle/operators/fc_op.cc b/paddle/operators/fc_op.cc
index 0eccc5fe4c481a63f6b1dca00e4cc5328c8f9834..8453267b4addac44fa098b4d9a51f0780f068376 100644
--- a/paddle/operators/fc_op.cc
+++ b/paddle/operators/fc_op.cc
@@ -12,11 +12,16 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "type_alias.h"
+#include "paddle/operators/net_op.h"
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
 
+using OpRegistry = framework::OpRegistry;
+
 class FullyConnectedOp : public NetOp {
  public:
   void Init() override {
@@ -39,9 +44,10 @@ class FullyConnectedOp : public NetOp {
   }
 };
 
-class FullyConnectedOpMaker : public OpProtoAndCheckerMaker {
+class FullyConnectedOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  FullyConnectedOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  FullyConnectedOpMaker(framework::OpProto *proto,
+                        framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "the input of fc operator");
     AddInput("W", "the weight of fc operator");
@@ -66,4 +72,5 @@ USE_OP(rowwise_add);
 USE_OP(sigmoid);
 USE_OP(softmax);
 
+namespace ops = paddle::operators;
 REGISTER_OP(fc, ops::FullyConnectedOp, ops::FullyConnectedOpMaker);
diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc
index 405ed219f0145ec1c3d8a315f39a134d3d9d7643..04a820b6168b0e696024ed9f8cded6a9d1e45e9d 100644
--- a/paddle/operators/fill_zeros_like_op.cc
+++ b/paddle/operators/fill_zeros_like_op.cc
@@ -42,8 +42,8 @@ The output will have the same size with input.
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP(fill_zeros_like, paddle::operators::FillZerosLikeOp,
-            paddle::operators::FillZerosLikeOpMaker);
+namespace ops = paddle::operators;
+REGISTER_OP(fill_zeros_like, ops::FillZerosLikeOp, ops::FillZerosLikeOpMaker);
 REGISTER_OP_CPU_KERNEL(
     fill_zeros_like,
-    paddle::operators::FillZerosLikeKernel<paddle::platform::CPUPlace, float>);
+    ops::FillZerosLikeKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/fill_zeros_like_op.cu b/paddle/operators/fill_zeros_like_op.cu
index 4f1054cf47e35572dbbc51ca742994065a027919..fdbcf520a0d7b4ddfe3fc1837a21e0ce88b8e8fa 100644
--- a/paddle/operators/fill_zeros_like_op.cu
+++ b/paddle/operators/fill_zeros_like_op.cu
@@ -16,6 +16,7 @@
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/fill_zeros_like_op.h"
 
+namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
     fill_zeros_like,
-    paddle::operators::FillZerosLikeKernel<paddle::platform::GPUPlace, float>);
+    ops::FillZerosLikeKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h
index dfaed2c9aaf2bf5c1a9b803fc9c8b9ea0e5c5d4e..f846c7a8ab15e2cd997564edb36660a1360227a8 100644
--- a/paddle/operators/fill_zeros_like_op.h
+++ b/paddle/operators/fill_zeros_like_op.h
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc
index aa5479ceaff379d99a5f7322edac0c5e69816974..2121faf15b9fcce5efd9fa8d63dbc298bb2ed527 100644
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -17,18 +17,18 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-class MeanOp : public OperatorWithKernel {
+class MeanOp : public framework::OperatorWithKernel {
  protected:
-  void InferShape(const InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE(ctx.InputVar("X") != nullptr,
-                   "Input of MeanOp must be initialized.");
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input of MeanOp must be initialized.");
     ctx.Output<Tensor>("Out")->Resize({1});
   }
 };
 
-class MeanOpMaker : public OpProtoAndCheckerMaker {
+class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MeanOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  MeanOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of mean op");
     AddOutput("Out", "The output of mean op").IgnoreGradient();
@@ -36,9 +36,9 @@ class MeanOpMaker : public OpProtoAndCheckerMaker {
   }
 };
 
-class MeanGradOp : public OperatorWithKernel {
+class MeanGradOp : public framework::OperatorWithKernel {
  protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
     ctx.Output<Tensor>("X" + framework::kGradVarSuffix)
         ->Resize(ctx.Input<Tensor>("X")->dims());
   }
@@ -47,7 +47,10 @@ class MeanGradOp : public OperatorWithKernel {
 }  // namespace operators
 }  // namespace paddle
 
+namespace ops = paddle::operators;
 REGISTER_OP(mean, ops::MeanOp, ops::MeanOpMaker);
-REGISTER_OP_CPU_KERNEL(mean, ops::MeanKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(mean,
+                       ops::MeanKernel<paddle::platform::CPUPlace, float>);
 REGISTER_GRADIENT_OP(mean, mean_grad, ops::MeanGradOp);
-REGISTER_OP_CPU_KERNEL(mean_grad, ops::MeanGradKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(mean_grad,
+                       ops::MeanGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/mean_op.cu b/paddle/operators/mean_op.cu
index 8b97b0154ccdc8c41a90f7580af829c5c8663b60..7af624d81dc5ffbb5c31b4d6f6eb8f9f8652a431 100644
--- a/paddle/operators/mean_op.cu
+++ b/paddle/operators/mean_op.cu
@@ -16,5 +16,8 @@
 
 #include "paddle/operators/mean_op.h"
 
-REGISTER_OP_GPU_KERNEL(mean, ops::MeanKernel<ops::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(mean_grad, ops::MeanGradKernel<ops::GPUPlace, float>);
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(mean,
+                       ops::MeanKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(mean_grad,
+                       ops::MeanGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/mean_op.h b/paddle/operators/mean_op.h
index 40a1e2d099acad90b1bbac50f62ea7c4f691c1b4..f3db0a29bb234948d180d964fb82057632ec4414 100644
--- a/paddle/operators/mean_op.h
+++ b/paddle/operators/mean_op.h
@@ -13,15 +13,24 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
 template <typename Place, typename T>
-class MeanKernel : public OpKernel {
+class MeanKernel : public framework::OpKernel {
  public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
     auto input = context.Input<Tensor>(0);
     auto output = context.Output<Tensor>(0);
 
@@ -36,9 +45,9 @@ class MeanKernel : public OpKernel {
 };
 
 template <typename Place, typename T>
-class MeanGradKernel : public OpKernel {
+class MeanGradKernel : public framework::OpKernel {
  public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
     auto OG = context.Input<Tensor>("Out" + framework::kGradVarSuffix);
     PADDLE_ENFORCE(framework::product(OG->dims()) == 1,
                    "Mean Gradient should be scalar");
diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc
index b9099ad4e3d000df45a8df1b6f8f2b27b197d744..9c570cff28e7c6da3d377482eced7eb12a5e1122 100644
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -17,9 +17,9 @@
 namespace paddle {
 namespace operators {
 
-class MulOp : public OperatorWithKernel {
+class MulOp : public framework::OperatorWithKernel {
  protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
     auto dim0 = ctx.Input<Tensor>("X")->dims();
     auto dim1 = ctx.Input<Tensor>("Y")->dims();
     PADDLE_ENFORCE_EQ(dim0.size(), 2,
@@ -35,9 +35,9 @@ class MulOp : public OperatorWithKernel {
   }
 };
 
-class MulOpMaker : public OpProtoAndCheckerMaker {
+class MulOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The first input of mul op");
     AddInput("Y", "The second input of mul op");
@@ -50,9 +50,9 @@ The equation is: Out = X * Y
   }
 };
 
-class MulOpGrad : public OperatorWithKernel {
+class MulOpGrad : public framework::OperatorWithKernel {
  protected:
-  void InferShape(const InferShapeContext &ctx) const override {}
+  void InferShape(const framework::InferShapeContext &ctx) const override {}
   std::string DebugString() const override {
     LOG(INFO) << "MulGrad";
     return "";
@@ -62,7 +62,8 @@ class MulOpGrad : public OperatorWithKernel {
 }  // namespace operators
 }  // namespace paddle
 
+namespace ops = paddle::operators;
 REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker);
 REGISTER_GRADIENT_OP(mul, mul_grad, ops::MulOpGrad);
 
-REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/mul_op.cu b/paddle/operators/mul_op.cu
index 1dc04c4297daed7a7861a09cf6b99446c296ffa5..43debbc21a365a15c914e60e151f7782b82080cb 100644
--- a/paddle/operators/mul_op.cu
+++ b/paddle/operators/mul_op.cu
@@ -15,4 +15,6 @@
 #define EIGEN_USE_GPU
 #include "paddle/operators/mul_op.h"
 
-REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<ops::GPUPlace, float>);
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h
index 7ecd6e8ac01c9efeabe9d2873da39503966ba8df..ab12631c03453a18fbb067e2d12c2bc332acd567 100644
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@@ -13,16 +13,21 @@
    limitations under the License. */
 
 #pragma once
-
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
 template <typename Place, typename T>
-class MulKernel : public OpKernel {
+class MulKernel : public framework::OpKernel {
  public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
     Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair = {
         {Eigen::IndexPair<Eigen::DenseIndex>(1, 0)}};
 
@@ -40,5 +45,6 @@ class MulKernel : public OpKernel {
     Z.device(place) = X.contract(Y, dim_pair);
   }
 };
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/net_op.h b/paddle/operators/net_op.h
index 0342cf4adb831e394935083f77673611d26e2e55..4e2353aa2bd8cf20f6a7feedad71d83efd819a47 100644
--- a/paddle/operators/net_op.h
+++ b/paddle/operators/net_op.h
@@ -16,10 +16,6 @@ limitations under the License. */
 
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
-#include "paddle/framework/scope.h"
-#include "paddle/operators/type_alias.h"
-#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -64,20 +60,29 @@ class NetOp : public framework::OperatorBase {
     }
   }
 
+  bool SupportGPU() const override {  // true only if every sub-op supports GPU
+    for (auto& op : ops_) {
+      if (!op->SupportGPU()) {
+        return false;
+      }
+    }
+    return true;  // also the result when ops_ is empty
+  }
+
   /**
    * @brief Add an operator by ptr
    */
   void AddOp(const std::shared_ptr<OperatorBase>& op) {
     PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed");
-    PADDLE_ENFORCE(op != nullptr, "Cannot Insert Null op");
+    PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op");
     ops_.push_back(op);
   }
 
   void InsertOp(size_t pos, const std::shared_ptr<OperatorBase>& op) {
     PADDLE_ENFORCE(!add_op_done_,
                    "Cannot InsertOp when this network is sealed");
-    PADDLE_ENFORCE(op != nullptr, "Cannot Insert Null op");
-    PADDLE_ENFORCE(pos <= ops_.size(), "Out of range");
+    PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op");
+    PADDLE_ENFORCE_LE(pos, ops_.size(), "Out of range");
     ops_.insert(ops_.begin() + pos, op);
   }
 
diff --git a/paddle/operators/net_op_test.cc b/paddle/operators/net_op_test.cc
index eb9832dc2c7fd6368b979e742ab4eb683fdaedbe..977f3de706f8c81933ab751385f3d6f999f874b4 100644
--- a/paddle/operators/net_op_test.cc
+++ b/paddle/operators/net_op_test.cc
@@ -2,31 +2,27 @@
 
 #include <gtest/gtest.h>
 
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
-
 namespace paddle {
 namespace operators {
+using Scope = framework::Scope;
+using DeviceContext = platform::DeviceContext;
 
 static int infer_shape_cnt = 0;
 static int run_cnt = 0;
 
-class TestOp : public OperatorBase {
+class TestOp : public framework::OperatorBase {
  public:
-  void InferShape(const framework::Scope& scope) const override {
-    ++infer_shape_cnt;
-  }
-  void Run(const framework::Scope& scope,
-           const paddle::platform::DeviceContext& dev_ctx) const override {
+  void InferShape(const Scope& scope) const override { ++infer_shape_cnt; }
+  void Run(const Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override {
     ++run_cnt;
   }
 };
 
-class EmptyOp : public OperatorBase {
+class EmptyOp : public framework::OperatorBase {
  public:
   void InferShape(const Scope& scope) const override {}
-  void Run(const Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {}
+  void Run(const Scope& scope, const DeviceContext& dev_ctx) const override {}
 };
 
 template <typename T>
@@ -73,7 +69,7 @@ TEST(OpKernel, all) {
   net->Run(scope, dev_ctx);
   ASSERT_EQ(2, infer_shape_cnt);
   ASSERT_EQ(2, run_cnt);
-  ASSERT_THROW(net->AddOp(op2), paddle::platform::EnforceNotMet);
+  ASSERT_THROW(net->AddOp(op2), platform::EnforceNotMet);
 }
 
 TEST(NetOp, insert_op) {
diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index 43c9aa72cd724a792c05501330d1ffee34cde632..4ed338359e62816ed2c4b58b512db8b230f2f786 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -14,17 +14,19 @@
 
 #include "paddle/operators/recurrent_op.h"
 
-#include <glog/logging.h>
 #include <cstring>
 #include <sstream>
 
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/net_op.h"
-#include "paddle/platform/enforce.h"
 
 namespace paddle {
 namespace operators {
 
+using Scope = framework::Scope;
+using Variable = framework::Variable;
+using Tensor = framework::Tensor;
+
 void RecurrentAlgorithm::InferShape(const Scope& scope) const {
   seq_len_ = scope.FindVar((arg_->inlinks[0]).external)
                  ->GetMutable<Tensor>()
@@ -140,10 +142,11 @@ void RecurrentOp::Init() {
   alg_.Init(std::move(arg));
 }
 
-class RecurrentAlgorithmProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+class RecurrentAlgorithmProtoAndCheckerMaker
+    : public framework::OpProtoAndCheckerMaker {
  public:
-  RecurrentAlgorithmProtoAndCheckerMaker(OpProto* proto,
-                                         OpAttrChecker* op_checker)
+  RecurrentAlgorithmProtoAndCheckerMaker(framework::OpProto* proto,
+                                         framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     const auto& name = RecurrentOp::kArgName;
     // inputs and outputs stored in proto
diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc
index 32c6c2dd4efa85359b4e95471e8ba09e56afec57..7e4770630ed2a49214194689aa489e6ab8e476da 100644
--- a/paddle/operators/rnn/recurrent_op_utils.cc
+++ b/paddle/operators/rnn/recurrent_op_utils.cc
@@ -18,7 +18,9 @@ namespace paddle {
 namespace operators {
 namespace rnn {
 
-namespace fmw = paddle::framework;
+namespace f = paddle::framework;
+
+using Tensor = framework::Tensor;
 
 void SegmentInputs(const std::vector<Scope*>& step_scopes,
                    const std::vector<Link>& inlinks, const size_t seq_len,
@@ -30,10 +32,10 @@ void SegmentInputs(const std::vector<Scope*>& step_scopes,
                    inlinks[i].external);
 
     Tensor* input = input_var->GetMutable<Tensor>();
-    fmw::DDim dims = input->dims();
+    f::DDim dims = input->dims();
     PADDLE_ENFORCE(static_cast<size_t>(dims[0]) == seq_len,
                    "all the inlinks must have same length");
-    fmw::DDim step_dims = slice_ddim(dims, 1, dims.size());
+    f::DDim step_dims = slice_ddim(dims, 1, dims.size());
     for (size_t j = 0; j < seq_len; j++) {
       Tensor* step_input =
           step_scopes[j]->NewVar(inlinks[i].internal)->GetMutable<Tensor>();
@@ -58,11 +60,10 @@ void ConcatOutputs(const std::vector<Scope*>& step_scopes,
       auto step_scope_var = step_scopes[0]->FindVar(outlinks[i].internal);
       PADDLE_ENFORCE(step_scope_var != nullptr, "%s not in scope",
                      outlinks[i].internal);
-      fmw::DDim step_dims =
-          step_scope_var->template GetMutable<Tensor>()->dims();
+      f::DDim step_dims = step_scope_var->template GetMutable<Tensor>()->dims();
       std::vector<int> dims_vec = vectorize(step_dims);
       dims_vec.insert(dims_vec.begin(), seq_len);
-      output->Resize(fmw::make_ddim(dims_vec));
+      output->Resize(f::make_ddim(dims_vec));
     } else {
       output->mutable_data<float>(platform::CPUPlace());
       for (size_t j = 0; j < seq_len; j++) {
@@ -104,7 +105,7 @@ void LinkMemories(const std::vector<Scope*>& scopes,
 }
 
 void InitArgument(const ArgumentName& name, Argument* arg,
-                  const OperatorBase& op) {
+                  const framework::OperatorBase& op) {
   arg->step_net = op.Input(name.step_net);
   arg->step_scopes = op.Output(name.step_scopes);
 
diff --git a/paddle/operators/rnn/recurrent_op_utils.h b/paddle/operators/rnn/recurrent_op_utils.h
index 379754b98fcead6debe0a60efa62fce4b7761940..17941c503cfcc83415b8bc635623a2c2ce2981c3 100644
--- a/paddle/operators/rnn/recurrent_op_utils.h
+++ b/paddle/operators/rnn/recurrent_op_utils.h
@@ -17,12 +17,13 @@
 #include <string>
 
 #include "paddle/framework/operator.h"
-#include "paddle/operators/type_alias.h"
 
 namespace paddle {
 namespace operators {
 namespace rnn {
 
+using Scope = framework::Scope;
+
 /**
  * Memory of a RNN (same as the role of `Momory` in PaddlePaddle).
  *
@@ -86,7 +87,7 @@ void LinkMemories(const std::vector<Scope*>& step_scopes,
                   const int offset, bool infer_shape_mode);
 
 void InitArgument(const ArgumentName& name, Argument* arg,
-                  const OperatorBase& op);
+                  const framework::OperatorBase& op);
 
 }  // namespace rnn
 }  // namespace operators
diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc
index c6a1f082138f5a75e2d9cebbe7b547911e952e17..28b56a6934a9f03fc779671bf49fbc4ef3bcee90 100644
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -13,12 +13,13 @@
    limitations under the License. */
 
 #include "paddle/operators/rowwise_add_op.h"
+
 namespace paddle {
 namespace operators {
 
-class RowWiseAddOp : public OperatorWithKernel {
+class RowWiseAddOp : public framework::OperatorWithKernel {
  protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
     auto dim0 = ctx.Input<Tensor>("X")->dims();
     auto dim1 = ctx.Input<Tensor>("b")->dims();
 
@@ -30,9 +31,10 @@ class RowWiseAddOp : public OperatorWithKernel {
   }
 };
 
-class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
+class RowWiseAddOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  RowWiseAddOpMaker(framework::OpProto *proto,
+                    framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The left input of row-wise add op, must be matrix");
     AddInput("b", "The right input of row-wise add op, must be vector");
@@ -48,6 +50,7 @@ for i in xrange(X.shape[0]):
 }  // namespace operators
 }  // namespace paddle
 
+namespace ops = paddle::operators;
 REGISTER_OP(rowwise_add, ops::RowWiseAddOp, ops::RowWiseAddOpMaker);
-REGISTER_OP_CPU_KERNEL(rowwise_add,
-                       ops::RowWiseAddKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    rowwise_add, ops::RowWiseAddKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/rowwise_add_op.cu b/paddle/operators/rowwise_add_op.cu
index f76faa0a3a93a1ac277a1d1aa83c3fa6c3944648..86f80b81228a69ac4c05a4693901570f2b9966e0 100644
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
@@ -15,5 +15,6 @@
 #define EIGEN_USE_GPU
 #include "paddle/operators/rowwise_add_op.h"
 
-REGISTER_OP_GPU_KERNEL(rowwise_add,
-                       ops::RowWiseAddKernel<ops::GPUPlace, float>);
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    rowwise_add, ops::RowWiseAddKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h
index 9e9f9d110c30452439cf4108d4a574b20a581c16..2a67407b52fe4aaf1d66571829c1e6ed7065c224 100644
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
@@ -13,15 +13,24 @@
    limitations under the License. */
 
 #pragma once
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
 template <typename Place, typename T>
-class RowWiseAddKernel : public OpKernel {
+class RowWiseAddKernel : public framework::OpKernel {
  public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
     auto out = context.Output<Tensor>(0);
     out->mutable_data<T>(context.GetPlace());
 
diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc
index 659cb41d989489d0e0f8d3cea3f5fb06094d23dc..30fe6fd491a8ee4a30f0f317a816ad7c712d96d9 100644
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -17,9 +17,9 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-class SGDOp : public OperatorWithKernel {
+class SGDOp : public framework::OperatorWithKernel {
  protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
     PADDLE_ENFORCE(
         ctx.Input<Tensor>("param")->dims() == ctx.Input<Tensor>("grad")->dims(),
         "Two input of SGD Op's dimension must be same.");
@@ -27,9 +27,9 @@ class SGDOp : public OperatorWithKernel {
   }
 };
 
-class SGDOpMaker : public OpProtoAndCheckerMaker {
+class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SGDOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  SGDOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("param", "input parameter");
     AddInput("grad", "input gradient");
@@ -47,5 +47,7 @@ param_out = param - learning_rate * grad;
 }  // namespace operators
 }  // namespace paddle
 
+namespace ops = paddle::operators;
 REGISTER_OP(sgd, ops::SGDOp, ops::SGDOpMaker);
-REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(sgd,
+                       ops::SGDOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu
index 72629ccfbb8bc8ec53045289bd985c721c62fa10..f5ba6d3c29f8dfbfdea4fbf2c3d5fd7f5b358666 100644
--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
@@ -15,4 +15,6 @@
 #define EIGEN_USE_GPU
 #include "paddle/operators/sgd_op.h"
 
-REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel<ops::GPUPlace, float>);
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(sgd,
+                       ops::SGDOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h
index bf5b195933fce7faa46bcc96032e784076178cf7..bfb449d0b029409eda4177fc7643810ee6a1df3d 100644
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
@@ -13,15 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
 template <typename Place, typename T>
-class SGDOpKernel : public OpKernel {
+class SGDOpKernel : public framework::OpKernel {
  public:
-  void Compute(const ExecutionContext& ctx) const override {
+  void Compute(const framework::ExecutionContext& ctx) const override {
     auto param = ctx.Input<Tensor>("param");
     auto grad = ctx.Input<Tensor>("grad");
     auto param_out = ctx.Output<Tensor>(0);
diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc
index 27904ea0c3c21ec760dfed1b822bd45b586a933d..315887d8c4035054b993805c77f811d963d8508c 100644
--- a/paddle/operators/sigmoid_op.cc
+++ b/paddle/operators/sigmoid_op.cc
@@ -13,19 +13,21 @@
    limitations under the License. */
 
 #include "paddle/operators/sigmoid_op.h"
+
 namespace paddle {
 namespace operators {
 
-class SigmoidOp : public OperatorWithKernel {
+class SigmoidOp : public framework::OperatorWithKernel {
  protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
     ctx.Output<Tensor>("Y")->Resize(ctx.Input<Tensor>("X")->dims());
   }
 };
 
-class SigmoidOpMaker : public OpProtoAndCheckerMaker {
+class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  SigmoidOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "sigmoid input");
     AddOutput("Y", "sigmoid output");
@@ -33,9 +35,9 @@ class SigmoidOpMaker : public OpProtoAndCheckerMaker {
   }
 };
 
-class SigmoidOpGrad : public OperatorWithKernel {
+class SigmoidOpGrad : public framework::OperatorWithKernel {
  protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
     ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
   }
 };
@@ -43,9 +45,11 @@ class SigmoidOpGrad : public OperatorWithKernel {
 }  // namespace operators
 }  // namespace paddle
 
+namespace ops = paddle::operators;
 REGISTER_OP(sigmoid, ops::SigmoidOp, ops::SigmoidOpMaker);
 REGISTER_GRADIENT_OP(sigmoid, sigmoid_grad, ops::SigmoidOpGrad);
 
-REGISTER_OP_CPU_KERNEL(sigmoid, ops::SigmoidKernel<ops::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(sigmoid_grad,
-                       ops::SigmoidGradKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(sigmoid,
+                       ops::SigmoidKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    sigmoid_grad, ops::SigmoidGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/sigmoid_op.cu b/paddle/operators/sigmoid_op.cu
index e80ba081f2ff805664cf92f3cb47e9ad51889058..1a50dfe14a7b9e2614aadb7729de9f9e461e9905 100644
--- a/paddle/operators/sigmoid_op.cu
+++ b/paddle/operators/sigmoid_op.cu
@@ -15,6 +15,9 @@
 #define EIGEN_USE_GPU
 #include "paddle/operators/sigmoid_op.h"
 
-REGISTER_OP_GPU_KERNEL(sigmoid, ops::SigmoidKernel<ops::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(sigmoid_grad,
-                       ops::SigmoidGradKernel<ops::GPUPlace, float>);
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(sigmoid,
+                       ops::SigmoidKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    sigmoid_grad, ops::SigmoidGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/sigmoid_op.h b/paddle/operators/sigmoid_op.h
index d513261e74423ce93a50eaaaec1c7d5fadb8f4a8..7af879b2091e4a7f80a3a64be029394156650c23 100644
--- a/paddle/operators/sigmoid_op.h
+++ b/paddle/operators/sigmoid_op.h
@@ -13,16 +13,21 @@
    limitations under the License. */
 
 #pragma once
-
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
 template <typename Place, typename T>
-class SigmoidKernel : public OpKernel {
+class SigmoidKernel : public framework::OpKernel {
  public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
     auto input = context.Input<Tensor>(0);
     auto output = context.Output<Tensor>(0);
     output->mutable_data<T>(context.GetPlace());
@@ -37,9 +42,9 @@ class SigmoidKernel : public OpKernel {
 };
 
 template <typename Place, typename T>
-class SigmoidGradKernel : public OpKernel {
+class SigmoidGradKernel : public framework::OpKernel {
  public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
     auto Y_t = context.Input<Tensor>("Y");
     auto dY_t = context.Input<Tensor>(framework::GradVarName("Y"));
     auto dX_t = context.Output<Tensor>(framework::GradVarName("X"));
diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc
index 836bce2294f52af3062fe2bde82fb1df2207f54c..962787fffd8c5efaf42319b19dfcd071c48ba2bd 100644
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -17,18 +17,19 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-class SoftmaxOp : public OperatorWithKernel {
+class SoftmaxOp : public framework::OperatorWithKernel {
  protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
     PADDLE_ENFORCE(ctx.Input<Tensor>("X")->dims().size() == 2UL,
                    "The input of softmax op must be matrix");
     ctx.Output<Tensor>("Y")->Resize(ctx.Input<Tensor>("X")->dims());
   }
 };
 
-class SoftmaxOpMaker : public OpProtoAndCheckerMaker {
+class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SoftmaxOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  SoftmaxOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "input of softmax");
     AddOutput("Y", "output of softmax");
@@ -36,12 +37,12 @@ class SoftmaxOpMaker : public OpProtoAndCheckerMaker {
   }
 };
 
-class SoftmaxOpGrad : public OperatorWithKernel {
+class SoftmaxOpGrad : public framework::OperatorWithKernel {
  protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
     PADDLE_ENFORCE(ctx.InputVar("Y") != nullptr, "Input(Y) should not be null");
-    PADDLE_ENFORCE(ctx.InputVar(framework::GradVarName("Y")) != nullptr,
-                   "Input(Y@GRAD) should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Y")),
+                            "Input(Y@GRAD) should not be null");
     PADDLE_ENFORCE(ctx.Input<Tensor>("Y")->dims() ==
                        ctx.Input<Tensor>(framework::GradVarName("Y"))->dims(),
                    "the shape of Input(0) and Input(1) should be the same");
@@ -53,8 +54,11 @@ class SoftmaxOpGrad : public OperatorWithKernel {
 }  // namespace operators
 }  // namespace paddle
 
+namespace ops = paddle::operators;
+
 REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker);
-REGISTER_OP_CPU_KERNEL(softmax, ops::SoftmaxKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(softmax,
+                       ops::SoftmaxKernel<paddle::platform::CPUPlace, float>);
 REGISTER_GRADIENT_OP(softmax, softmax_grad, ops::SoftmaxOpGrad);
-REGISTER_OP_CPU_KERNEL(softmax_grad,
-                       ops::SoftmaxGradKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    softmax_grad, ops::SoftmaxGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/softmax_op.cu b/paddle/operators/softmax_op.cu
index b79228580a7ea0f70b62eb2dc7a61cf85bc0b5fb..2e99a89699dbdcafc8055c47debf9e49f10507e6 100644
--- a/paddle/operators/softmax_op.cu
+++ b/paddle/operators/softmax_op.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
@@ -13,9 +13,11 @@
    limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/framework/op_registry.h"
 #include "paddle/operators/softmax_op.h"
 
-REGISTER_OP_GPU_KERNEL(softmax, ops::SoftmaxKernel<ops::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(softmax_grad,
-                       ops::SoftmaxGradKernel<ops::GPUPlace, float>);
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(softmax,
+                       ops::SoftmaxKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    softmax_grad, ops::SoftmaxGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h
index b2dbcf57edf1a64da8da0d9a4c14d708eec17f3f..4fa6b59540498638c3b7df639ae10a66c0fa1c16 100644
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -13,19 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-
-#include "paddle/framework/ddim.h"
-#include "paddle/framework/operator.h"
-#include "paddle/framework/tensor.h"
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
 template <typename Place, typename T>
-class SoftmaxKernel : public OpKernel {
+class SoftmaxKernel : public framework::OpKernel {
  public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
     auto input = context.Input<Tensor>("X");
     auto output = context.Output<Tensor>("Y");
     output->mutable_data<T>(context.GetPlace());
@@ -62,9 +64,9 @@ class SoftmaxKernel : public OpKernel {
 };
 
 template <typename Place, typename T>
-class SoftmaxGradKernel : public OpKernel {
+class SoftmaxGradKernel : public framework::OpKernel {
  public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
     std::shared_ptr<Tensor> scale_ = std::make_shared<Tensor>();
 
     auto Y = context.Input<Tensor>("Y");
diff --git a/paddle/operators/type_alias.h b/paddle/operators/type_alias.h
deleted file mode 100644
index eac12d35dd8d2977191218167ebb0a6e638d5d73..0000000000000000000000000000000000000000
--- a/paddle/operators/type_alias.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/net_op.h"
-
-namespace paddle {
-namespace operators {
-
-using OpKernel = framework::OpKernel;
-using OperatorBase = framework::OperatorBase;
-using InferShapeContext = framework::InferShapeContext;
-using ExecutionContext = framework::ExecutionContext;
-using Variable = framework::Variable;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-using Tensor = framework::Tensor;
-using Scope = framework::Scope;
-using OperatorWithKernel = framework::OperatorWithKernel;
-using OperatorBase = framework::OperatorBase;
-using OpProtoAndCheckerMaker = framework::OpProtoAndCheckerMaker;
-using OpProto = framework::OpProto;
-using OpAttrChecker = framework::OpAttrChecker;
-using CPUPlace = platform::CPUPlace;
-using GPUPlace = platform::GPUPlace;
-using OpRegistry = framework::OpRegistry;
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..405b84b76d2e24db25d2ff16e99495f2f132ef09
--- /dev/null
+++ b/paddle/operators/uniform_random_op.cc
@@ -0,0 +1,84 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <random>
+#include <type_traits>
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+// It seems that Eigen::Tensor::random in GPU will SEGFAULT.
+// Use std::random and thrust::random(thrust is a std library in CUDA) to
+// implement uniform random.
+template <typename T>
+class CPUUniformRandomKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* tensor = context.Output<framework::Tensor>(0);
+    T* data = tensor->mutable_data<T>(context.GetPlace());
+    unsigned int seed =
+        static_cast<unsigned int>(context.op_.GetAttr<int>("seed"));
+    std::minstd_rand engine;
+    if (seed == 0) {
+      seed = std::random_device()();
+    }
+    engine.seed(seed);
+    std::uniform_real_distribution<T> dist(
+        static_cast<T>(context.op_.GetAttr<float>("min")),
+        static_cast<T>(context.op_.GetAttr<float>("max")));
+    for (ssize_t i = 0; i < framework::product(tensor->dims()); ++i) {
+      data[i] = dist(engine);
+    }
+  }
+};
+
+class UniformRandomOp : public framework::OperatorWithKernel {
+ protected:
+  void InferShape(const framework::InferShapeContext& ctx) const override {
+    PADDLE_ENFORCE(GetAttr<float>("min") < GetAttr<float>("max"),
+                   "uniform_random's min must less then max");
+    auto* tensor = ctx.Output<framework::Tensor>(0);
+    auto dims = GetAttr<std::vector<int>>("dims");
+    tensor->Resize(framework::make_ddim(dims));
+  }
+};
+
+class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  UniformRandomOpMaker(framework::OpProto* proto,
+                       framework::OpAttrChecker* op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput("Out", "The output tensor of uniform random op");
+    AddComment(R"DOC(Uniform random operator.
+
+Used to initialize tensor with uniform random generator.
+)DOC");
+    AddAttr<std::vector<int>>("dims", "the dimension of random tensor");
+    AddAttr<float>("min", "Minimum value of uniform random").SetDefault(-1.0f);
+    AddAttr<float>("max", "Maximun value of uniform random").SetDefault(1.0f);
+    AddAttr<int>("seed",
+                 "Random seed of uniform random. "
+                 "0 means generate a seed by system")
+        .SetDefault(0);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(uniform_random, paddle::operators::UniformRandomOp,
+            paddle::operators::UniformRandomOpMaker);
+REGISTER_OP_CPU_KERNEL(uniform_random,
+                       paddle::operators::CPUUniformRandomKernel<float>);
diff --git a/paddle/operators/uniform_random_op.cu b/paddle/operators/uniform_random_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f1a63e52ec0d3d46a505a89d7d7916bf93a58221
--- /dev/null
+++ b/paddle/operators/uniform_random_op.cu
@@ -0,0 +1,70 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/random.h>
+#include <thrust/transform.h>
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct UniformGenerator {
+  T min_, max_;
+  unsigned int seed_;
+
+  __host__ __device__ UniformGenerator(T min, T max, int seed)
+      : min_(min), max_(max), seed_(seed) {}
+
+  __host__ __device__ T operator()(const unsigned int n) const {
+    thrust::minstd_rand rng;
+    rng.seed(seed_);
+    thrust::uniform_real_distribution<T> dist(min_, max_);
+    rng.discard(n);
+    return dist(rng);
+  }
+};
+
+// It seems that Eigen::Tensor::random in GPU will SEGFAULT.
+// Use std::random and thrust::random(thrust is a std library in CUDA) to
+// implement uniform random.
+template <typename T>
+class GPUUniformRandomKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* tensor = context.Output<framework::Tensor>(0);
+    T* data = tensor->mutable_data<T>(context.GetPlace());
+    unsigned int seed =
+        static_cast<unsigned int>(context.op_.GetAttr<int>("seed"));
+    if (seed == 0) {
+      seed = std::random_device()();
+    }
+    T min = static_cast<T>(context.op_.GetAttr<float>("min"));
+    T max = static_cast<T>(context.op_.GetAttr<float>("max"));
+    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
+    ssize_t N = framework::product(tensor->dims());
+    thrust::transform(index_sequence_begin, index_sequence_begin + N,
+                      thrust::device_ptr<T>(data),
+                      UniformGenerator<T>(min, max, seed));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_GPU_KERNEL(uniform_random,
+                       paddle::operators::GPUUniformRandomKernel<float>);
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index ef72b973c1a465a8ac03cae1070429160eac0ac1..0547ac93cd183afbcede41d280c6b4b16ed7dab1 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -666,4 +666,24 @@ void Argument::subArgFrom(const Argument& input,
   }
 }
 
+void Argument::reorganizeSeqInfo(
+    const ICpuGpuVectorPtr seqStartPos,
+    const ICpuGpuVectorPtr subSeqStartPos,
+    std::vector<std::vector<int>>& reorganizedSeqInfo) {
+  int* seqStarts = seqStartPos->getMutableData(false);
+  int* subSeqStarts = subSeqStartPos->getMutableData(false);
+
+  int seqNum = seqStartPos->getSize() - 1;
+  reorganizedSeqInfo.resize(seqNum, std::vector<int>());
+  int seqIdx = 0;
+  for (size_t i = 0; i < subSeqStartPos->getSize(); ++i) {
+    reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
+    if (subSeqStarts[i] == seqStarts[seqIdx + 1]) {
+      seqIdx++;
+      if (seqIdx == seqNum) return;
+      reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
+    }
+  }
+}
+
 }  // namespace paddle
diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h
index 0ccdef802e71b659788cfd24f28ebe43e1917db1..d8d7a4398f99a2794c5d25528a7d582f5ed629ba 100644
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -317,6 +317,30 @@ struct Argument {
    */
   void printValueString(std::ostream& stream,
                         const std::string& prefix = "") const;
+
+  /**
+   * @brief reorganizeSeqInfo will reorganize sequenceStartPositions and
+   * subSequenceStartPositions into a 2 dimensional array: reorganizedSeqInfo.
+   *
+   * @param seqStartPos: sequenceStartPositions of an Argument.
+   * @param subSeqStartPos: subSequenceStartPositions of an Argument.
+   * @param reorganizedSeqInfo: output; the reorganized start position info.
+   *
+   * Examples:
+   * seqStartPos: [0, 4, 15, 20, 28]
+   * subSeqStartPos: [0, 3, 4, 5, 7, 10, 15, 20, 22, 23, 25, 28]
+   * reorganizedSeqInfo:
+   *   [
+   *     [0,3,4],
+   *     [4,5,7,10,15],
+   *     [15,20],
+   *     [20,22,23,25,28]
+   *   ]
+   */
+  static void reorganizeSeqInfo(
+      const ICpuGpuVectorPtr seqStartPos,
+      const ICpuGpuVectorPtr subSeqStartPos,
+      std::vector<std::vector<int>>& reorganizedSeqInfo);
 };
 
 }  // namespace paddle
diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index 60ce5822d3ade7ea6fa940db2f002a89263a34ca..46293e6c0bb318ec1dadc76a0213ddfdbb8bcd70 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -187,13 +187,9 @@ inline void throw_on_error(T e) {
   __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__)
 #define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
   __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
-
-// if two values have different data types, choose a compatible type for them.
-template <typename T1, typename T2>
-struct CompatibleType {
-  static const bool t1_to_t2 = std::is_convertible<T1, T2>::value;
-  typedef typename std::conditional<t1_to_t2, T2, T1>::type type;
-};
+#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...)                            \
+  PADDLE_ENFORCE(nullptr != (__VAL), #__VAL " should not be null\n%s", \
+                 paddle::string::Sprintf("" __VA_ARGS__));
 
 template <typename T>
 inline std::string enforce_to_string(const T& val) {
@@ -211,17 +207,12 @@ inline std::string enforce_to_string(const char* const& val) {
 }
 
 #define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...)        \
-  PADDLE_ENFORCE(__COMPATIBLE_TYPE(__VAL0, __VAL1, __VAL0)                    \
-                     __CMP __COMPATIBLE_TYPE(__VAL0, __VAL1, __VAL1),         \
+  PADDLE_ENFORCE(__VAL0 __CMP __VAL1,                                         \
                  "enforce %s " #__CMP " %s failed, %s " #__INV_CMP " %s\n%s", \
                  #__VAL0, #__VAL1,                                            \
                  paddle::platform::enforce_to_string(__VAL0),                 \
                  paddle::platform::enforce_to_string(__VAL1),                 \
                  paddle::string::Sprintf("" __VA_ARGS__));
 
-#define __COMPATIBLE_TYPE(__VAL0, __VAL1, __VAL)              \
-  typename paddle::platform::CompatibleType<decltype(__VAL0), \
-                                            decltype(__VAL1)>::type(__VAL)
-
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/enforce_test.cc b/paddle/platform/enforce_test.cc
index 7117b49474044af08ae9db79c2fae6693e966af2..4dfb69754608cb1120baa295072c3d031a4e1a7b 100644
--- a/paddle/platform/enforce_test.cc
+++ b/paddle/platform/enforce_test.cc
@@ -9,8 +9,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/platform/enforce.h"
+#include <memory>
+
 #include "gtest/gtest.h"
+#include "paddle/platform/enforce.h"
 
 TEST(ENFORCE, OK) {
   PADDLE_ENFORCE(true, "Enforce is ok %d now %f", 123, 0.345);
@@ -196,3 +198,27 @@ TEST(ENFORCE_LT, FAIL) {
 
   ASSERT_TRUE(in_catch);
 }
+
+TEST(ENFORCE_NOT_NULL, OK) {
+  int* a = new int;
+  PADDLE_ENFORCE_NOT_NULL(a);
+  delete a;
+}
+TEST(ENFORCE_NOT_NULL, FAIL) {
+  bool in_catch = false;
+  int* a{nullptr};
+
+  try {
+    PADDLE_ENFORCE_NOT_NULL(a);
+  } catch (const paddle::platform::EnforceNotMet& error) {
+    // Catch by const reference: no copy of the exception object is made.
+    in_catch = true;
+    const std::string msg = "a should not be null";
+    const char* what = error.what();
+    // Only the prefix is compared; the rest of the message is not checked.
+    for (size_t i = 0; i < msg.length(); ++i) {
+      ASSERT_EQ(what[i], msg[i]);
+    }
+  }
+  ASSERT_TRUE(in_catch);
+}
diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt
deleted file mode 100644
index 8e6b258e00c0012876cda8ffc5b340322d51e894..0000000000000000000000000000000000000000
--- a/paddle/pybind/CMakeLists.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-cc_library(paddle_pybind SHARED
-    SRCS pybind.cc
-    DEPS pybind python backward
-	fc_op
-	sgd_op
-	add_op
-	mean_op
-	cross_entropy_op
-	recurrent_op
-	fill_zeros_like_op)
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index ede9e210245df740f13ebb32c98313554f522dd9..44442be4729ff77e8d378c93acebe1486eb75397 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -33,6 +33,9 @@ Configuring cmake in /paddle/build ...
       -DWITH_AVX=${WITH_AVX:-OFF}
       -DWITH_GOLANG=${WITH_GOLANG:-OFF}
       -DWITH_SWIG_PY=ON
+      -DWITH_C_API=${WITH_C_API:-OFF}
+      -DWITH_PYTHON=${WITH_PYTHON:-ON}
+      -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
       -DCUDNN_ROOT=/usr/
       -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
       -DWITH_TESTING=${WITH_TESTING:-OFF}
@@ -49,7 +52,9 @@ cmake .. \
       -DWITH_GPU=${WITH_GPU:-OFF} \
       -DWITH_AVX=${WITH_AVX:-OFF} \
       -DWITH_GOLANG=${WITH_GOLANG:-OFF} \
-      -DWITH_SWIG_PY=ON \
+      -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
+      -DWITH_C_API=${WITH_C_API:-OFF} \
+      -DWITH_PYTHON=${WITH_PYTHON:-ON} \
       -DCUDNN_ROOT=/usr/ \
       -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} \
       -DWITH_TESTING=${WITH_TESTING:-OFF} \
diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh
index 33fb5d84e2701c163b5d1b1bb3362ee81ebb34ea..dfcff38302703066e868c60e213f0f7cbc55a31e 100755
--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
@@ -5,15 +5,9 @@ set -e
 mkdir -p $TRAVIS_BUILD_DIR/build
 cd $TRAVIS_BUILD_DIR/build
 
-# Compile paddle binaries first
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_GOLANG=ON -DWITH_STYLE_CHECK=OFF
-
-mkdir output
-make -j `nproc`
-find .. -name '*whl' | xargs pip install  # install all wheels.
-rm -rf *
 # Compile Documentation only.
 cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON
+make -j `nproc` gen_proto_py
 make -j `nproc` paddle_docs paddle_docs_cn
 
 # check websites for broken links
@@ -35,6 +29,7 @@ TARGET_BRANCH="gh-pages"
 SOURCE_BRANCH="master"
 
 # Clone the repo to output directory
+mkdir output
 git clone $REPO output
 cd output
 
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
index 18584cafe7971bad281b498908c54780250791b7..e1cea8bd0de5394020a498725485cea025512e48 100644
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -17,7 +17,7 @@ foreach(filename ${proto_filenames})
             COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
             ARGS "--python_out=${PROJ_ROOT}/python/paddle/proto"
             "-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL}
-            DEPENDS ${ABS_FIL} ${external_project_dependencies})
+            DEPENDS ${ABS_FIL} protoc)
 endforeach()
 
 add_custom_target(gen_proto_py ALL DEPENDS ${PROTO_GEN_PY})
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 9ea69fc5e57636c22fb20d5d97de760b9cc3bcde..b7b696ef0c13e1bae2e910e08d1a1ea3e45cd5d5 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2657,6 +2657,31 @@ class SubSequenceLayer(LayerBase):
         self.create_bias_parameter(bias, size)
 
 
+@config_layer('sub_nested_seq')
+class SubNestedSequenceLayer(LayerBase):
+    def __init__(self, name, inputs, selected_indices, bias=False, **xargs):
+        if isinstance(inputs, list):
+            assert len(inputs) == 1, ('the first input of sub_nested_seq '
+                                      'layer is a single nested sequence.')
+            inputs = inputs[0]
+        if isinstance(selected_indices, list):
+            assert len(selected_indices) == 1, (
+                'the second input of '
+                'sub_nested_seq layer is a single layer which is a '
+                'set of selected indices.')
+            selected_indices = selected_indices[0]
+
+        super(SubNestedSequenceLayer, self).__init__(
+            name,
+            'sub_nested_seq',
+            0,
+            inputs=[inputs, selected_indices],
+            **xargs)
+        input_layer0 = self.get_input_layer(0)
+        size = input_layer0.size
+        self.set_layer_size(size)
+
+
 @config_layer('out_prod')
 class OuterProdLayer(LayerBase):
     def __init__(self, name, inputs, device=None):
@@ -3223,6 +3248,16 @@ class CTCLayer(LayerBase):
         config_assert(len(self.inputs) == 2, 'CTCLayer must have 2 inputs')
 
 
+@config_layer('kmax_seq_score')
+class KmaxSeqScoreLayer(LayerBase):
+    def __init__(self, name, inputs, beam_size, **xargs):
+        super(KmaxSeqScoreLayer, self).__init__(
+            name, 'kmax_seq_score', 0, inputs=inputs, **xargs)
+        config_assert(
+            len(self.inputs) == 1, 'KmaxSeqScoreLayer has only one input.')
+        self.config.beam_size = beam_size
+
+
 @config_layer('warp_ctc')
 class WarpCTCLayer(LayerBase):
     def __init__(self,
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index ea5fdcc50f6abbc67fb61b7fd56c100d9f9811d0..1bc55c869601551aff5fc0311458f906385522d2 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -129,8 +129,10 @@ __all__ = [
     'prelu_layer',
     'gated_unit_layer',
     'crop_layer',
+    'sub_nested_seq_layer',
     'clip_layer',
     'slice_projection',
+    'kmax_sequence_score_layer',
 ]
 
 
@@ -224,8 +226,11 @@ class LayerType(object):
 
     PRELU = 'prelu'
     CROP_LAYER = 'crop'
+    SUB_NESTED_SEQ = 'sub_nested_seq'
     CLIP_LAYER = 'clip'
 
+    KMAX_SEQ_SCORE = 'kmax_seq_score'
+
     @staticmethod
     def is_layer_type(type_name):
         """
@@ -6088,6 +6093,53 @@ def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None):
         size=l.config.size)
 
 
+@wrap_name_default()
+@layer_support()
+def sub_nested_seq_layer(input, selected_indices, name=None):
+    """
+    The sub_nested_seq_layer accepts two inputs: the first one is a nested
+    sequence; the second one is a set of selected indices in the nested sequence.
+
+    Then sub_nested_seq_layer trims the first nested sequence input according
+    to the selected indices to form a new output. This layer is useful in
+    beam training.
+
+    The example usage is:
+
+    .. code-block:: python
+
+        sub_nest_seq = sub_nested_seq_layer(input=data, selected_indices=ids)
+
+
+    :param input: A nested sequence.
+    :type input: LayerOutput
+    :param selected_indices: a set of sequence indices in the nested sequence.
+    :type selected_indices: LayerOutput
+    :param name: name of this layer.
+    :type name: basestring
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(input, LayerOutput), (
+        'The first input of '
+        'sub_nested_seq_layer must be a Paddle layer.')
+    assert isinstance(selected_indices, LayerOutput), (
+        'The second input of '
+        'sub_nested_seq_layer must be a Paddle layer.')
+
+    l = Layer(
+        inputs=input.name,
+        selected_indices=selected_indices.name,
+        name=name,
+        type=LayerType.SUB_NESTED_SEQ)
+    return LayerOutput(
+        name=name,
+        layer_type=LayerType.SUB_NESTED_SEQ,
+        parents=input,
+        size=l.config.size)
+
+
 @wrap_name_default("clip")
 def clip_layer(input, min, max, name=None):
     """
@@ -6109,7 +6161,8 @@ def clip_layer(input, min, max, name=None):
     :type min: double
     :param max: The upper threshold for clipping.
     :type max: double
-    :return: LayerOutput
+    :return: LayerOutput object.
+    :rtype: LayerOutput
     """
     Layer(
         name=name,
@@ -6119,3 +6172,41 @@ def clip_layer(input, min, max, name=None):
         max=max)
     return LayerOutput(
         name, LayerType.CLIP_LAYER, parents=[input], size=input.size)
+
+
+@wrap_name_default()
+@layer_support()
+def kmax_sequence_score_layer(input, name=None, beam_size=1):
+    """
+    This layer accepts one input which are scores over a sequence or a nested
+    sequence, and returns indices of beam_size sequences with highest scores.
+
+    .. code-block:: python
+
+        kmax_indices = kmax_sequence_score_layer(input=input_layer, beam_size)
+
+
+    :param name: The Layer Name.
+    :type name: basestring
+    :param input: The input layer. It stores scores over a sequence or a nested
+        sequence and its size must be 1.
+    :type input: LayerOutput.
+    :param beam_size: squence indices with top beam_size scores are returned.
+    :type beam_size: double
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput), ("kmax_sequence_score_layer "
+                                            "accepts only one input.")
+    assert input.size == 1, (
+        "input of kmax_sequence_score_layer is a score"
+        "over a sequence or a nested sequence, so its width must be 1.")
+
+    Layer(
+        name=name,
+        type=LayerType.KMAX_SEQ_SCORE,
+        inputs=[input.name],
+        beam_size=beam_size)
+
+    return LayerOutput(
+        name, LayerType.KMAX_SEQ_SCORE, parents=[input], size=input.size)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index 0ffa58bc1e2088f75e7cd25c7ecdffbe270825a4..a61beb871ad064c617fa141451afcb2a5ac64854 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -7,6 +7,7 @@ test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
 test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
 test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
 test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer
-test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer)
+test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
+test_kmax_seq_socre_layer test_seq_select_layers)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..81bd71f68eb3f2c04ccd46ee3b77a07543395c60
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
@@ -0,0 +1,66 @@
+type: "nn"
+layers {
+  name: "input"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "data"
+  type: "data"
+  size: 128
+  active_type: ""
+}
+layers {
+  name: "__fc_layer_0__"
+  type: "fc"
+  size: 1
+  active_type: "exponential"
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___fc_layer_0__.w0"
+  }
+  bias_parameter_name: "___fc_layer_0__.wbias"
+}
+layers {
+  name: "__kmax_sequence_score_layer_0__"
+  type: "kmax_seq_score"
+  active_type: ""
+  inputs {
+    input_layer_name: "__fc_layer_0__"
+  }
+  beam_size: 5
+}
+parameters {
+  name: "___fc_layer_0__.w0"
+  size: 128
+  initial_mean: 0.0
+  initial_std: 0.0883883476483
+  dims: 128
+  dims: 1
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___fc_layer_0__.wbias"
+  size: 1
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "data"
+output_layer_names: "__kmax_sequence_score_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "input"
+  layer_names: "data"
+  layer_names: "__fc_layer_0__"
+  layer_names: "__kmax_sequence_score_layer_0__"
+  input_layer_names: "data"
+  output_layer_names: "__kmax_sequence_score_layer_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_select_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_select_layers.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..4b906b113e3c0569d5576127e100d097e4923436
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_select_layers.protostr
@@ -0,0 +1,37 @@
+type: "nn"
+layers {
+  name: "input_seq"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "input"
+  type: "data"
+  size: 5
+  active_type: ""
+}
+layers {
+  name: "__sub_nested_seq_layer_0__"
+  type: "sub_nested_seq"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "input_seq"
+  }
+  inputs {
+    input_layer_name: "input"
+  }
+}
+input_layer_names: "input_seq"
+output_layer_names: "__sub_nested_seq_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "input_seq"
+  layer_names: "input"
+  layer_names: "__sub_nested_seq_layer_0__"
+  input_layer_names: "input_seq"
+  output_layer_names: "__sub_nested_seq_layer_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d245c5a41c793e1f02f306bfe64071bd9885906e
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python
+#coding=utf-8
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='input', size=300)
+
+data = data_layer(name="data", size=128)
+scores = fc_layer(input=data, size=1, act=ExpActivation())
+kmax_seq_id = kmax_sequence_score_layer(input=scores, beam_size=5)
+
+outputs(kmax_seq_id)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_seq_select_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_seq_select_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d1c3175ba9801d69f3f9cb9e754858253192270
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_seq_select_layers.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python
+#coding=utf-8
+from paddle.trainer_config_helpers import *
+
+beam_size = 5
+
+data = data_layer(name='input_seq', size=300)
+selected_ids = data_layer(name='input', size=beam_size)
+sub_nest_seq = sub_nested_seq_layer(input=data, selected_indices=selected_ids)
+
+outputs(sub_nest_seq)
diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt
index 541639ac21661529b0b1f2cc8d8fa25605052c8c..10659caa882fd3d4060f9947413a392c3b681ee8 100644
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -13,6 +13,7 @@ py_test(test_protobuf SRCS test_protobuf.py)
 py_test(test_add_two_op SRCS test_add_two_op.py)
 py_test(test_sigmoid_op SRCS test_sigmoid_op.py)
 py_test(test_softmax_op SRCS test_softmax_op.py)
+py_test(test_cross_entropy_op SRCS test_cross_entropy_op.py)
 py_test(test_fill_zeros_like_op SRCS test_fill_zeros_like_op.py)
 
 py_test(gradient_checker SRCS gradient_checker.py)
@@ -21,3 +22,4 @@ py_test(test_rowwise_add_op SRCS test_rowwise_add_op.py)
 
 py_test(test_default_scope_funcs SRCS test_default_scope_funcs.py)
 py_test(test_operator SRCS test_operator.py)
+py_test(test_uniform_random_op SRCS test_uniform_random_op.py)
diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py
index cfd29932f5b46920815819c5a75d62a0138e21a2..b73c4869d14a62a951d8e45dafb14b7523355519 100644
--- a/python/paddle/v2/framework/tests/gradient_checker.py
+++ b/python/paddle/v2/framework/tests/gradient_checker.py
@@ -1,16 +1,31 @@
+import unittest
+
+import numpy
 import paddle.v2.framework.core as core
 from paddle.v2.framework.op import Operator
-import numpy
-import unittest
 
 __all__ = ['get_numeric_gradient']
 
 
+def create_op(op_type):
+    kwargs = dict()
+    for in_name in Operator.get_op_input_names(op_type):
+        kwargs[in_name] = in_name
+    for out_name in Operator.get_op_output_names(op_type):
+        kwargs[out_name] = out_name
+
+    return Operator(op_type, **kwargs)
+
+
+def grad_var_name(var_name):
+    return var_name + "@GRAD"  # e.g. "X" -> "X@GRAD"
+
+
 def get_numeric_gradient(op,
                          input_values,
                          output_name,
                          input_to_check,
-                         delta=1e-2,
+                         delta=0.005,
                          local_scope=None):
     """
     Get Numeric Gradient for an operator's input.
@@ -76,6 +91,113 @@ def get_numeric_gradient(op,
     return gradient_flat.reshape(tensor_to_check.get_dims())
 
 
+class GradientChecker(unittest.TestCase):
+    def __is_close(self, numeric_grads, scope, max_relative_error):
+        for name in numeric_grads:
+            op_grad = numpy.array(
+                scope.find_var(grad_var_name(name)).get_tensor())
+            is_close = numpy.allclose(
+                numeric_grads[name], op_grad, rtol=max_relative_error, atol=100)
+            if not is_close:
+                return False
+        return True
+
+    def check_grad(self,
+                   forward_op,
+                   input_vars,
+                   inputs_to_check,
+                   output_name,
+                   no_grad_set=None,
+                   only_cpu=False,
+                   max_relative_error=0.005):
+        """
+        :param forward_op: used to create backward_op
+        :param input_vars: numpy value of input variable. The following
+            computation will use these variables.
+        :param inputs_to_check: inputs var names that should check gradient.
+        :param output_name: output name that used to
+        :param max_relative_error: The relative tolerance parameter.
+        :param no_grad_set: used when create backward ops
+        :param only_cpu: only compute and check gradient on cpu kernel.
+        :return:
+        """
+        if no_grad_set is None:
+            no_grad_set = set()
+
+        tmp_outs = forward_op.temp_outputs()
+        no_tmp_out = filter(lambda name: name not in tmp_outs,
+                            forward_op.outputs())
+        if len(no_tmp_out) != 1:
+            raise ValueError("non temp out_names should be 1")
+
+        in_names = forward_op.inputs()
+        for no_grad in no_grad_set:
+            if no_grad not in in_names:
+                raise ValueError("no_grad should be in in_names")
+
+        backward_op = core.Operator.backward(forward_op, no_grad_set)
+
+        places = [core.CPUPlace()]
+        if not only_cpu and core.is_compile_gpu() and backward_op.support_gpu():
+            places.append(core.GPUPlace(0))
+
+        numeric_grad = dict()
+        # get numeric gradient
+        for check_name in inputs_to_check:
+            numeric_grad[check_name] = \
+                get_numeric_gradient(forward_op, input_vars, output_name, check_name)
+
+        # get operator gradient according to different device
+        for place in places:
+            scope = core.Scope()
+            ctx = core.DeviceContext.create(place)
+
+            # create input var and set value
+            for name, value in input_vars.iteritems():
+                if name not in in_names:
+                    raise ValueError(name + " not in op.inputs_")
+                var = scope.new_var(name).get_tensor()
+                var.set_dims(value.shape)
+                var.set(value, place)
+
+            # create output var
+            for out_name in forward_op.outputs():
+                scope.new_var(out_name).get_tensor()
+
+            # infer the shape of output var and compute/set value of output var
+            forward_op.infer_shape(scope)
+            forward_op.run(scope, ctx)
+
+            # create output grad var
+            # set shape as the output var
+            # set value of this grad to ones
+            for name in forward_op.outputs():
+                out_tensor = scope.find_var(name).get_tensor()
+                grad_tensor = scope.new_var(grad_var_name(name)).get_tensor()
+                grad_tensor.set_dims(out_tensor.shape())
+                data = 1.0 * numpy.ones(out_tensor.shape())
+                grad_tensor.set(data, place)
+
+            # create input grad var
+            for name in backward_op.outputs():
+                scope.new_var(name).get_tensor()
+
+            # infer the shape of input gradient var and compute/set it's value
+            # with backward op
+            backward_op.infer_shape(scope)
+            backward_op.run(scope, ctx)
+
+            if isinstance(place, core.CPUPlace):
+                msg = "CPU kernel gradient is not close to numeric gradient"
+            else:
+                if isinstance(place, core.GPUPlace):
+                    msg = "GPU kernel gradient is not close to numeric gradient"
+                else:
+                    raise ValueError("unknown place " + type(place))
+            self.assertTrue(
+                self.__is_close(numeric_grad, scope, max_relative_error), msg)
+
+
 if __name__ == '__main__':
 
     class GetNumericGradientTest(unittest.TestCase):
@@ -87,4 +209,28 @@ if __name__ == '__main__':
             arr = get_numeric_gradient(add_op, {'X': x, "Y": y}, 'Z', 'X')
             self.assertAlmostEqual(arr.mean(), 1.0, delta=1e-2)
 
+        def test_softmax_op(self):
+            def stable_softmax(x):
+                """Compute the softmax of vector x in a numerically stable way."""
+                shiftx = x - numpy.max(x)
+                exps = numpy.exp(shiftx)
+                return exps / numpy.sum(exps)
+
+            def label_softmax_grad(Y, dY):
+                dX = Y * 0.0
+                for i in range(Y.shape[0]):
+                    d = numpy.dot(Y[i, :], dY[i, :])
+                    dX[i, :] = Y[i, :] * (dY[i, :] - d)
+                return dX
+
+            softmax_op = Operator("softmax", X="X", Y="Y")
+
+            X = numpy.random.random((2, 2)).astype("float32")
+            Y = numpy.apply_along_axis(stable_softmax, 1, X)
+            dY = numpy.ones(Y.shape)
+            dX = label_softmax_grad(Y, dY)
+
+            arr = get_numeric_gradient(softmax_op, {"X": X}, 'Y', 'X')
+            numpy.testing.assert_almost_equal(arr, dX, decimal=1e-2)
+
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/op_test_util.py b/python/paddle/v2/framework/tests/op_test_util.py
index da6bed0fcd690d5a7f53f44d0181c75f12e5d074..dd65e0f2dc23d3f657ff16c55fb297dae210b2d7 100644
--- a/python/paddle/v2/framework/tests/op_test_util.py
+++ b/python/paddle/v2/framework/tests/op_test_util.py
@@ -1,6 +1,5 @@
-import paddle.v2.framework.core as core
-import unittest
 import numpy
+import paddle.v2.framework.core as core
 from paddle.v2.framework.op import Operator
 
 
@@ -24,7 +23,7 @@ class OpTestMeta(type):
             scope = core.Scope()
             kwargs = dict()
             places = [core.CPUPlace()]
-            if core.is_compile_gpu() and core.Operator.support_gpu(self.type):
+            if core.is_compile_gpu():
                 places.append(core.GPUPlace(0))
 
             for place in places:
@@ -53,6 +52,8 @@ class OpTestMeta(type):
                         kwargs[attr_name] = self.attrs[attr_name]
 
                 op = Operator(self.type, **kwargs)
+                if isinstance(place, core.GPUPlace) and not op.support_gpu():
+                    return
 
                 op.infer_shape(scope)
 
diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
index b26e25d58b59bd1cb16e9ba2a1cccd27799b15f2..4815192e255c6e0429db3f50918a76a773b30131 100644
--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
@@ -1,9 +1,10 @@
 import unittest
 import numpy
 from op_test_util import OpTestMeta
+from gradient_checker import GradientChecker, create_op
 
 
-class TestSGD(unittest.TestCase):
+class TestCrossEntropy(unittest.TestCase):
     __metaclass__ = OpTestMeta
 
     def setUp(self):
@@ -20,7 +21,18 @@ class TestSGD(unittest.TestCase):
         self.outputs = {'Y': numpy.array(Y).astype("float32")}
 
 
-# TODO(superjom) add gradient check
+class CrossEntropyGradOpTest(GradientChecker):
+    def test_softmax_grad(self):
+        op = create_op("onehot_cross_entropy")
+        batch_size = 100
+        class_num = 10
+        inputs = {
+            "X": numpy.random.uniform(
+                0.1, 1.0, [batch_size, class_num]).astype("float32"),
+            "label": (class_num / 2) * numpy.ones(batch_size).astype("int32")
+        }
+        self.check_grad(op, inputs, set("X"), "Y")
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_softmax_op.py b/python/paddle/v2/framework/tests/test_softmax_op.py
index d20e085b8e43488480edf07b6cd4edcd861883f3..e670d93653e07d35e5019c9daac45c214eddf367 100644
--- a/python/paddle/v2/framework/tests/test_softmax_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_op.py
@@ -1,9 +1,8 @@
 import unittest
 
 import numpy as np
-import paddle.v2.framework.core as core
-from paddle.v2.framework.op import Operator
 
+from gradient_checker import GradientChecker, create_op
 from op_test_util import OpTestMeta
 
 
@@ -25,62 +24,11 @@ class TestSoftmaxOp(unittest.TestCase):
         }
 
 
-class TestSoftmaxGradOp(unittest.TestCase):
-    def test_softmax_grad(self):
-        op = Operator('softmax', X="X", Y="Y")
-        backward_op = core.Operator.backward(op, set())
-        self.assertEqual(backward_op.type(), "softmax_grad")
-        expected = '''Op(softmax_grad), inputs:(X, Y, Y@GRAD), outputs:(X@GRAD).'''
-        self.assertEqual(expected, str(backward_op))
-
-        batch_size = 3
-        class_num = 5
-        # Initialize X and add 1e-2 for numerical stability
-        Y = np.random.rand(batch_size, class_num).astype(np.float32)
-        Y = Y + 1e-2
-        dY = np.random.rand(batch_size, class_num).astype(np.float32)
-
-        # Reference implementation of cross entropy with soft labels
-        def label_softmax_grad(Y, dY):
-            dX = Y * 0.0
-            for i in range(batch_size):
-                d = np.dot(Y[i, :], dY[i, :])
-                dX[i, :] = Y[i, :] * (dY[i, :] - d)
-            return dX
-
-        expected = label_softmax_grad(Y, dY)
-
-        scope = core.Scope()
-        places = []
-        places.append(core.CPUPlace())
-        if core.is_compile_gpu():
-            places.append(core.GPUPlace(0))
-
-        for place in places:
-            y = scope.new_var("Y")
-            y_tensor = y.get_tensor()
-            y_tensor.set_dims([batch_size, class_num])
-            y_tensor.alloc_float(place)
-            y_tensor.set(Y, place)
-
-            dy = scope.new_var("Y@GRAD")
-            dy_tensor = dy.get_tensor()
-            dy_tensor.set_dims([batch_size, class_num])
-            dy_tensor.alloc_float(place)
-            dy_tensor.set(dY, place)
-
-            x = scope.new_var("X")
-            dx = scope.new_var("X@GRAD")
-
-            tensor = scope.find_var("X@GRAD").get_tensor()
-            backward_op.infer_shape(scope)
-            self.assertEqual([batch_size, class_num], tensor.shape())
-
-            ctx = core.DeviceContext.create(place)
-            backward_op.run(scope, ctx)
-            actual = np.array(tensor)
-
-            np.testing.assert_almost_equal(actual, expected, decimal=3)
+class SoftmaxGradOpTest(GradientChecker):
+    def test_softmax(self):
+        """Run GradientChecker.check_grad on softmax with a random 10x10 X."""
+        inputs = {"X": np.random.uniform(0.1, 1, [10, 10]).astype("float32")}
+        self.check_grad(create_op("softmax"), inputs, set(["X"]), "Y")
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/framework/tests/test_uniform_random_op.py b/python/paddle/v2/framework/tests/test_uniform_random_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3d2bb44da3977c0899b2609a8efe15b7e1789f2
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_uniform_random_op.py
@@ -0,0 +1,35 @@
+import unittest
+from paddle.v2.framework.op import Operator
+import paddle.v2.framework.core as core
+import numpy
+
+
+class UniformRandomTest(unittest.TestCase):
+    """Runs the uniform_random operator and checks the mean of its output."""
+
+    def test_uniform_random_cpu(self):
+        self.uniform_random_test(place=core.CPUPlace())
+
+    def test_uniform_random_gpu(self):
+        # Report a skip, not a vacuous pass, when is_compile_gpu() is false.
+        if not core.is_compile_gpu():
+            self.skipTest("core.is_compile_gpu() returned false")
+        self.uniform_random_test(place=core.GPUPlace(0))
+
+    def uniform_random_test(self, place):
+        """Run uniform_random into variable "X" on `place`; check its mean."""
+        low, high = -5.0, 10.0
+        scope = core.Scope()
+        scope.new_var("X").get_tensor()  # result unused; registers X in scope
+        op = Operator(
+            "uniform_random", Out="X", dims=[1000, 784], min=low, max=high,
+            seed=10)
+        op.infer_shape(scope)
+        op.run(scope, core.DeviceContext.create(place))
+        tensor = numpy.array(scope.find_var("X").get_tensor())
+        # A uniform distribution on [low, high] has mean (low + high) / 2.
+        self.assertAlmostEqual(tensor.mean(), (low + high) / 2, delta=0.1)
+
+
+if __name__ == '__main__':
+    unittest.main()